diff --git a/.gitignore b/.gitignore
index 08a66913..3d4c0094 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,9 +12,14 @@
 .tags*
 .deps
 *.pdf
+*.tt
+*.out
 
 # Ignore IDE files
 /.idea/
 /nbproject/
 
 reports/
+traces/
+users/
+saved_traces/
\ No newline at end of file
diff --git a/Kconfig b/Kconfig
new file mode 100644
index 00000000..16fec3fd
--- /dev/null
+++ b/Kconfig
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+#
+# Homa transport protocol
+#
+
+menuconfig HOMA
+	tristate "The Homa transport protocol"
+	depends on INET
+	depends on IPV6
+
+	help
+	  Homa is a network transport protocol for communication within
+	  a datacenter. It provides significantly lower latency than TCP,
+	  particularly for workloads containing a mixture of large and small
+	  messages operating at high network utilization. At present, Homa
+	  has been only partially upstreamed; this version provides bare-bones
+	  functionality but is not performant. For more information see the
+	  homa(7) man page or check out the Homa Wiki at
+	  https://homa-transport.atlassian.net/wiki/spaces/HOMA/overview.
+
+	  If unsure, say N.
diff --git a/Makefile b/Makefile
index 9ad3c40c..d68e49b8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,34 +1,96 @@
 # Makefile to build Homa as a Linux module.
 
+HOMA_OBJS := homa_devel.o \
+	   homa_incoming.o \
+	   homa_interest.o \
+	   homa_outgoing.o \
+	   homa_peer.o \
+	   homa_pool.o \
+	   homa_plumbing.o \
+	   homa_rpc.o \
+	   homa_sock.o \
+	   homa_timer.o \
+	   homa_utils.o \
+	   timetrace.o
+
+ifneq ($(__STRIP__),)
+MY_CFLAGS += -D__STRIP__
+else
+HOMA_OBJS += homa_grant.o \
+	   homa_metrics.o \
+	   homa_offload.o \
+	   homa_pacer.o \
+	   homa_qdisc.o \
+	   homa_skb.o
+endif
+
+CHECK_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o timetrace.o, $(HOMA_OBJS)))
+CHECK_SRCS += $(filter-out homa_receiver.h homa_devel.h, $(wildcard *.h))
+
+ifneq ($(KERNEL_SRC),)
+# As an alternative to KDIR, accept the variable KERNEL_SRC (used by
+# PetaLinux/Yocto, for example).
+KDIR ?= $(KERNEL_SRC)
+endif
+
+LINUX_VERSION ?= $(shell uname -r)
+KDIR ?= /lib/modules/$(LINUX_VERSION)/build
+
+LINUX_SRC_DIR ?= ../net-next
+
+ifneq ($(KERNELRELEASE),)
+
 obj-m += homa.o
-homa-y = homa_incoming.o \
-	homa_offload.o \
-	homa_outgoing.o \
-	homa_peertab.o \
-	homa_pool.o \
-	homa_plumbing.o \
-	homa_socktab.o \
-	homa_timer.o \
-	homa_utils.o \
-	timetrace.o
+homa-y = $(HOMA_OBJS)
 
 MY_CFLAGS += -g
-ccflags-y += ${MY_CFLAGS}
-CC += ${MY_CFLAGS}
+ccflags-y += $(MY_CFLAGS)
 
-KDIR ?= /lib/modules/$(shell uname -r)/build
+else
 
 all:
-	make -C $(KDIR) M=$(PWD) modules
+	$(MAKE) -C $(KDIR) M=$(shell pwd) modules
 
 install:
-	make -C $(KDIR) M=$(PWD) modules_install
+	$(MAKE) -C $(KDIR) M=$(shell pwd) modules_install
+
+kdoc:
+	$(LINUX_SRC_DIR)/scripts/kernel-doc -none $(CHECK_SRCS)
 
-check:
-	../homaLinux/scripts/kernel-doc -none *.c
+checkpatch:
+	$(LINUX_SRC_DIR)/scripts/checkpatch.pl --file --strict --codespell $(CHECK_SRCS)
+
+# Copy stripped source files to a Linux source tree
+HOMA_TARGET ?= $(LINUX_SRC_DIR)/net/homa
+CP_HDRS := homa_impl.h \
+	   homa_interest.h \
+	   homa_peer.h \
+	   homa_pool.h \
+	   homa_rpc.h \
+	   homa_sock.h \
+	   homa_stub.h \
+	   homa_wire.h \
+	   murmurhash3.h
+CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o homa_grant.o \
+	   homa_metrics.o homa_offload.o homa_pacer.o homa_qdisc.o \
+	   homa_skb.o timetrace.o, $(HOMA_OBJS)))
+CP_EXTRAS := Kconfig \
+	   Makefile
+CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS) $(CP_EXTRAS))
+net-next: $(CP_TARGETS) $(LINUX_SRC_DIR)/include/uapi/linux/homa.h
+$(HOMA_TARGET)/%: % util/strip.py
+	util/strip.py $< > $@
+$(HOMA_TARGET)/%.txt: %.txt
+	cp $< $@
+$(HOMA_TARGET)/Makefile: Makefile.upstream
+	cp $< $@
+$(HOMA_TARGET)/strip_decl.py: util/strip_decl.py
+	cp $< $@
+$(LINUX_SRC_DIR)/include/uapi/linux/homa.h: homa.h util/strip.py
+	util/strip.py $< > $@
 
 clean:
-	make -C $(KDIR) M=$(PWD) clean
+	$(MAKE) -C $(KDIR) M=$(shell pwd) clean
 
 # The following targets are useful for debugging Makefiles; they
 # print the value of a make variable in one of several contexts.
@@ -36,7 +98,9 @@
 print-%:
 	@echo $* = $($*)
 
 printBuild-%:
-	make -C $(KDIR) M=$(PWD) $@
+	$(MAKE) -C $(KDIR) M=$(shell pwd) $@
 
 printClean-%:
-	make -C $(KDIR) M=$(PWD) $@
+	$(MAKE) -C $(KDIR) M=$(shell pwd) $@
+
+endif
diff --git a/Makefile.upstream b/Makefile.upstream
new file mode 100644
index 00000000..1e02be7f
--- /dev/null
+++ b/Makefile.upstream
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+#
+# Makefile for the Linux implementation of the Homa transport protocol.
+
+obj-$(CONFIG_HOMA) := homa.o
+homa-y:= homa_incoming.o \
+	 homa_interest.o \
+	 homa_outgoing.o \
+	 homa_peer.o \
+	 homa_plumbing.o \
+	 homa_pool.o \
+	 homa_rpc.o \
+	 homa_sock.o \
+	 homa_timer.o \
+	 homa_utils.o
diff --git a/README.md b/README.md
index fb4a353c..d1ea532c 100644
--- a/README.md
+++ b/README.md
@@ -19,23 +19,23 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
 - The incast optimization from Section 3.6 of the SIGCOMM paper has not
   been implemented yet. If you would like to test Homa under large
   incasts, let me know and I will implement this feature.
 
-- Socket buffer memory management needs more work. Large numbers of large
-  messages (hundreds of MB?) may cause buffer exhaustion and deadlock.
-
 - Please contact me if you have any problems using this repo; I'm happy
   to provide advice and support.
 
-- The head is known to work under Linux 5.17.5 and 5.18. In the past
-  it has run under 5.4.3 and 4.15.18; you can access these versions with branches
-  named linux_5.4.3 and linux_4.15.18. These older branches are out of date
-  feature-wise: recent commits have not been back-ported to them.
+- The head is known to work under Linux 6.17.8. In the past, Homa has
+  run under several earlier versions of Linux. There is a separate branch
+  for each of these older versions, with names such as linux_4.15.18.
+  Older branches are out of date feature-wise: recent commits have not
+  been back-ported to them.
   Other versions of Linux have not been tested and
-  may require code changes (the upgrade from 4.15.18 to 5.4.3 took only about
-  a day). If you get Homa working on some other version, please submit a
-  pull request for the required code changes.
+  may require code changes (these upgrades rarely take long). If you get
+  Homa working on some other version, please submit a
+  pull request with the required code changes.
 
-- There now exists support for using Homa with gRPC: see the
-  [GitHub repo](https://github.com/PlatformLab/grpc_homa).
+- Related work that you may find useful:
+  - [Preliminary support for using Homa with gRPC](https://github.com/PlatformLab/grpc_homa)
+  - [A Go client that works with this module](https://github.com/dpeckett/go-homa)
 
 - To build the module, type `make all`; then type `sudo insmod homa.ko`
   to install it, and `sudo rmmod homa` to remove an installed module.
   In practice, though,
@@ -45,7 +45,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
   invoke it with no parameters to install and configure Homa on the
   current machine.
 
-- The script `cloudlab/bin/install` will copy relevant Homa files
+- The script `cloudlab/bin/install_homa` will copy relevant Homa files
   across a cluster of machines and configure Homa on each node. It assumes
   that nodes have names `nodeN` where N is a small integer, and it also
   assumes that you have already run `make` both in the top-level directory and
@@ -122,14 +122,64 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
 - Homa exports a collection of configuration parameters through the sysctl
   mechanism. For details, see the man page `homa.7`.
 
-## Significant recent improvements
+## Significant changes
+- January 2026: introduced the new 'homa_qdisc' queuing discipline to improve
+  performance when TCP and Homa run simultaneously. Results on a c6620
+  CloudLab cluster (100 Gbps network):
+  - Without homa_qdisc, if Homa and TCP run together, Homa performance
+    suffers (4x increase in P99 for short messages) but TCP performance
+    improves.
+  - With homa_qdisc, performance improves for both Homa and TCP, whether
+    running stand-alone or together.
+  - homa_qdisc improves Homa short-message P99 by 3x when running together
+    with TCP, but P99 is still slower than Homa standalone.
+  - TCP performance improves when running together with Homa, with or
+    without homa_qdisc.
+- November 2025: upgraded to Linux 6.17.8.
+- October 2025: added the HOMAIOCINFO ioctl for retrieving status
+  information about a Homa socket. See man/homa.7 for details.
+- May 2025: `homa_api.c` has been removed, so the functions `homa_abort`,
+  `homa_reply`, `homa_replyv`, `homa_send`, and `homa_sendv` no longer
+  exist.
+- May 2025: added support for network namespaces.
+- May 2025: reworked support for peers to cap peer memory usage.
+- April 2025: upgraded to Linux 6.13.9.
+- April 2025: major refactoring of grant management (more efficient,
+  removes complexity that was causing an unending stream of bugs).
+- March 2025: added a cap on memory used by outgoing messages: send
+  requests can block if the memory limit is reached.
+- March 2025: implemented private RPCs, resulting in API changes.
+  The HOMA_RECVMSG_REQUEST and HOMA_RECVMSG_RESPONSE flags no longer exist,
+  and struct homa_sendmsg_args now has a flags field with one defined
+  flag: HOMA_SENDMSG_PRIVATE.
+- February 2025: by default, incoming requests for a socket are rejected
+  unless the socket has been bound. setsockopt can be used with
+  SO_HOMA_SERVER to enable or disable incoming requests for any socket.
+- October 2024: the process of upstreaming Homa into the Linux kernel has
+  begun. The reviewing process is likely to result in API changes.
+  Upstreaming will occur in stages, so the first version to appear in Linux
+  will not be either functionally complete or performant. The sources in
+  this repository contain '#ifndef __STRIP__' directives, which
+  separate functionality being upstreamed from functionality that is not
+  currently upstreamed (some things, such as development aids,
+  may never be upstreamed).
+- October 2024: Homa now has an official IANA IP protocol number (146).
+- August 2024: upgraded to Linux 6.10.6.
+- July 2024: introduced "TCP hijacking", where Homa packets are sent as
+  legitimate TCP segments (using TCP as the IP protocol) and then reclaimed
+  from TCP on the destination.
+  This allows Homa to make better use of TSO and RSS.
+- June 2024: refactored sk_buff management to use frags; improves
+  efficiency significantly.
+- April 2024: replaced the `master` branch with `main`.
+- July 2023: upgraded to Linux 6.1.38.
 - December 2022: Version 2.0. This includes a new mechanism for managing
   buffer space for incoming messages, which improves throughput by
   50-100% in many situations. In addition, Homa now uses the sendmsg
   and recvmsg system calls, rather than ioctls, for sending and receiving
   messages. The API for receiving messages is incompatible with 1.01.
-- November 2022: Implemented software GSO for Homa.
-- September 2022: Added support for IPv6, as well as completion cookies.
+- November 2022: implemented software GSO for Homa.
+- September 2022: added support for IPv6, as well as completion cookies.
   This required small but incompatible changes to the API. Many thanks
   to Dan Manjarres for contributing these improvements.
diff --git a/balance.txt b/balance.txt
new file mode 100644
index 00000000..6295a0bd
--- /dev/null
+++ b/balance.txt
@@ -0,0 +1,117 @@
+This file discusses the issue of load-balancing in Homa.
+
+In order to keep up with fast networks, transport protocols must distribute
+their processing across multiple cores. For outgoing packets this happens
+naturally: sending threads run on different cores and packet processing
+for outbound packets happens on the same core as the sending thread. Things
+are more difficult for incoming packets. In general, an incoming packet
+will pass through 3 cores:
+* NAPI/GRO: the NIC distributes incoming packets across cores using RSS.
+  The number of incoming channels, and their association with cores, can
+  be configured in software. The NIC will then distribute packets across
+  those channels using a hash based on packet header fields. The device
+  driver receives packets as part of NAPI, then packets are collected into
+  batches using GRO and handed off to SoftIRQ.
+* SoftIRQ processing occurs on a (potentially) different core from NAPI/GRO;
+  the network stack runs here, including Homa's main handlers for incoming
+  packets. The system default is to compute another hash function on packet
+  headers to select a SoftIRQ core for a batch, but it is possible for GRO
+  to make its own choice of core, and Homa does this.
+* Once a complete message is received, it is handed off to an application
+  thread, which typically runs on a different core.
+
+The load balancing challenge is to distribute load across multiple cores
+without overloading any individual core ("hotspots"). This has proven
+quite difficult, and hotspots are the primary source of tail latency in Homa.
+The most common cause of hotspots is when 2 or more of the above tasks
+are assigned to the same core. For example:
+* Two batches from different NAPI/GRO cores might get assigned to the same
+  SoftIRQ core.
+* A particular core might be very busy handling NAPI/GRO for a stream of
+  packets in a large message; this will prevent application threads from
+  making progress on that core. A short message might pass through other
+  cores for NAPI/GRO and SoftIRQ, but if its application is running on
+  the busy core, then it will not be able to process the short message.
+
+Part of the problem is that core assignments are made independently by
+3 different schedulers (RSS for the NAPI/GRO core, GRO or the system for
+the SoftIRQ core, and the Linux scheduler for the application core),
+so conflicts are likely to occur.
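+To make this concrete, here is a toy sketch (Python, purely illustrative;
+the 16-core count is an assumption and this is not Homa code) of what
+happens when two stages pick cores with independent hashes:
+
+    CORES = 16
+
+    def pick_core(flow, stage):
+        # Stand-in for per-stage hashing: deterministic for a given flow,
+        # but uncorrelated across stages (as with RSS vs. SoftIRQ hashing).
+        return hash((flow, stage)) % CORES
+
+    flows = range(1000)
+    collisions = sum(1 for f in flows
+                     if pick_core(f, "napi") == pick_core(f, "softirq"))
+    # On average about 1/16 of flows end up doing NAPI/GRO and SoftIRQ
+    # work on the same core, before application threads are even counted.
+    print("%d of %d flows collide" % (collisions, len(flows)))
+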
+Only one of these schedulers is under control of the transport protocol.
+
+It's also important to note that using more cores isn't always the best
+approach. For example, if a node is lightly loaded, it would be best to
+do all RX processing on a single core: using multiple cores causes extra
+cache misses as data migrates from core to core, and it also adds latency
+to pass control between cores. In an ideal world, the number of cores used for
+protocol processing would be just enough to keep any of them from getting
+overloaded. However, it appears to be hard to vary the number of cores
+without risking overloads; except in a few special cases, Homa doesn't do
+this.
+
+Homa tries to use its control over SoftIRQ scheduling to minimize hotspots.
+Several different approaches have been tried over time; this document
+focuses on the two most recent ones, which are called "Gen2" and "Gen3".
+
+Gen2 Load Balancing
+-------------------
+* Gen2 assumes that NAPI/GRO processing is occurring on all cores.
+* When GRO chooses where to assign a batch of packets for SoftIRQ, it
+  considers the next several cores (in ascending circular core order
+  after the GRO core).
+* GRO uses several criteria to try to find a "good" core for SoftIRQ, such
+  as avoiding a core that has done recent GRO processing, or one for which
+  there is already pending SoftIRQ work.
+* Selection stops as soon as it finds a "good" core.
+* If no "good" core is found, then GRO will rotate among the successor
+  cores on a batch-by-batch basis.
+* In some cases, Gen2 will bypass the SoftIRQ handoff mechanism and simply
+  run SoftIRQ immediately on its core. This is done in two cases: short
+  packets and grant packets. Bypass is particularly useful for grants
+  because it eliminates the latency associated with a handoff, and grant
+  turnaround time is important for overall performance.
+
+Gen2 has several problems:
+* It doesn't do anything about the problem of application threads conflicting
+  with NAPI/GRO or SoftIRQ.
+* A single core may be assigned both SoftIRQ and NAPI/GRO work at the
+  same time.
+* The SoftIRQ core groups for different NAPI/GRO cores overlap, so it's
+  possible for multiple GROs to schedule batches to the same SoftIRQ core.
+* When receiving packets from a large message, Gen2 tends to alternate between
+  2 or more SoftIRQ cores, which results in unnecessary cache coherency
+  traffic.
+* If the NAPI/GRO core is overloaded, bypass can make things worse (especially
+  since grant processing results in transmitting additional packets, which
+  is fairly expensive).
+
+Gen3 Load Balancing
+-------------------
+The Gen3 load-balancing mechanism is an attempt to solve the problems
+associated with Gen2.
+* The number of channels is reduced, so that only 1/4 of the cores do
+  NAPI/GRO processing. This appears to be sufficient capacity to avoid
+  overloads on any of the NAPI/GRO cores.
+* Each NAPI/GRO core has 3 other cores (statically assigned) that it can use
+  for SoftIRQ processing. The SoftIRQ core groups for different NAPI/GRO
+  cores do not overlap. This means that SoftIRQ and GRO will never happen
+  simultaneously on the same core, and there will be no conflicts between
+  the SoftIRQ groups of different NAPI/GRO cores.
+* Gen3 takes steps to avoid core conflicts between application threads and
+  NAPI/GRO and SoftIRQ processing, as described below.
+* When an application thread is using Homa actively on a core, the core
+  is marked as "busy".
+  When GRO selects a SoftIRQ core, it attempts to
+  avoid cores that are busy with application threads. If there is a choice
+  of un-busy cores, GRO will try to reuse a single SoftIRQ core over and over.
+* Homa also keeps track of recent NAPI/GRO and SoftIRQ processing on each
+  core. When an incoming message becomes ready and there are multiple threads
+  waiting for messages, Homa tries to pick a thread whose core has not had
+  recent Homa activity.
+* Between these two mechanisms, the hope is that SoftIRQ and application
+  work will adjust their core assignments to avoid conflicts. A sketch of
+  this selection scheme appears below.
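+
+As a concrete illustration, selection in the style described above might
+look like the following (Python sketch; the 16-core layout, core groups,
+and names are assumptions for illustration, not actual Homa code):
+
+    GRO_CORES = [0, 2, 4, 6]                 # 1/4 of cores do NAPI/GRO
+    SOFTIRQ_CHOICES = {0: [1, 8, 9],         # statically assigned,
+                       2: [3, 10, 11],       # non-overlapping SoftIRQ
+                       4: [5, 12, 13],       # groups for each NAPI/GRO
+                       6: [7, 14, 15]}       # core
+    busy = set()                             # cores with active app threads
+
+    def pick_softirq_core(gro_core):
+        # Prefer the first candidate that isn't busy with an application
+        # thread; reusing one core consistently limits cache migration.
+        for c in SOFTIRQ_CHOICES[gro_core]:
+            if c not in busy:
+                return c
+        return SOFTIRQ_CHOICES[gro_core][0]  # all candidates busy
+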
+Gen3 was implemented in November of 2023; so far its performance appears to be
+about the same as Gen2 (slightly worse for W2 and W3, slightly better for W5).
+Gen3 performance on W3 appears highly variable: P99 latency can vary by 5-10x
+from run to run; as of December 2023 the reasons for this have not been
+determined.
\ No newline at end of file
diff --git a/cloudlab/bashrc b/cloudlab/bashrc
index bb26ad40..dd7fffcb 100644
--- a/cloudlab/bashrc
+++ b/cloudlab/bashrc
@@ -2,10 +2,12 @@
 # see /usr/share/doc/bash/examples/startup-files (in the package bash-doc)
 # for examples
 
-PATH=/opt/gradle-7.3/bin:/ouster/install/bin:~/homaModule/util:~/homaModule/perf:~/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+PATH=/opt/gradle-7.3/bin:/$USER/install/bin:~/homaModule/util:~/homaModule/perf:~/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 
 export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib/x86_64-linux-gnu
 
+export PYTHONPATH=/users/$USER/homaModule/util:/users/$USER/bin
+
 export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
 
 # If not running interactively, don't do anything
diff --git a/cloudlab/bin/ckill b/cloudlab/bin/ckill
index 011d4f24..815efe4f 100755
--- a/cloudlab/bin/ckill
+++ b/cloudlab/bin/ckill
@@ -1,18 +1,7 @@
 #!/bin/bash
 
-# Copyright (c) 2020-2023 Stanford University
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE
+# Copyright (c) 2020-2023 Homa Developers
+# SPDX-License-Identifier: BSD-1-Clause
 
 # Kill processes with a given name on a cluster of machines.
 #
diff --git a/cloudlab/bin/config b/cloudlab/bin/config
index 95cef8cf..5ccf574f 100755
--- a/cloudlab/bin/config
+++ b/cloudlab/bin/config
@@ -1,18 +1,7 @@
 #!/usr/bin/python3
 
-# Copyright (c) 2023 Stanford University
-#
-# Permission to use, copy, modify, and distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+# Copyright (c) 2023 Homa Developers
+# SPDX-License-Identifier: BSD-1-Clause
 
 """
 This script is used to configure CloudLab nodes for running Homa
 experiments.
@@ -21,17 +10,31 @@ parameters, and configuring the system in various ways such as setting
 power management parameters and enabling RPS.
 """
 
+import fcntl
+import functools
 from glob import glob
 import os
 import re
 import subprocess
 import sys
+import time
+
+from switch import *
 
 # Names for the main network interface and VLAN for this host. Set by
 # get_interfaces
 interface = None
 vlan = None
 
+# Cached result of get_cpu_type.
+cpu_type = None
+
+# Cached result of get_link_speed (integer Mbits/sec.)
+link_mbps = None
+
+# Cached result of get_nic_type.
+nic_type = None
+
 # Number of this node (e.g. 1 for "node1"). Set by get_node_num.
 node_num = None
 
@@ -39,6 +42,15 @@ node_num = None
 # Set by get_node_type.
 node_type = None
 
+# Number of nodes in the experiment (0 means uninitialized).
+num_nodes = 0
+
+# Cached result of get_node_names.
+node_names = []
+
+# Set from command line option to request detailed output.
+verbose = False
+
 # Contains information from /proc/cpuinfo with one entry for each
 # "processor" in /proc/cpuinfo. The entry is a dictionary with the
 # following entries (same names as in /proc/cpuinfo):
@@ -53,16 +65,69 @@
 # in cpuinfo). Filled in by read_cpu_info().
 num_phys_cores = 0
 
-def get_core_mask():
+# Describes the full channel mappings for xl170 machines at CloudLab.
+# Entry i is the core that will handle incoming packets on channel i.
+xl170_default_cores = [0, 10, 7, 19, 8, 18, 6, 16, 4, 14,
+                       11, 1, 7, 17, 15, 5, 13, 3, 2, 12]
+
+# False means that the installed version of Homa doesn't support sysctls.
+sysctl_avl = True
+
+def get_core_mask(core = -1):
     """
-    Returns a hexadecimal integer mask with a 1 bit for each core
-    (hyperthread) in this machine.
+    Returns a string representing a bit mask containing one bit for each
+    core. The mask is represented with one or more 32-bit hex values:
+    "val,val,...val". If the core argument is -1, then all of the bits are
+    ones (i.e. all cores are selected). If the core argument is >=0, then
+    all of the bits will be zeros except for the one selecting the indicated
+    core.
+
+    core: Either -1 (to select all cores) or the number of a single core
+          to select.
""" read_cpu_info() - mask = 0 + mask_words = [0] for cpu in cpu_info: - mask = mask | (1 << cpu["processor"]) - return "%x" % (mask) + cur_core = cpu["processor"] + word = core//32 + while word >= len(mask_words): + mask_words.append(0); + if (core == -1) or (cur_core == core): + mask_words[word] |= 1<<(cur_core & 0x1f); + result = "" + while len(mask_words) > 0: + if len(result) != 0: + result += ","; + result += ("%x" % (mask_words[-1])) + mask_words.pop() + return result + +def get_cpu_type(): + """ + Return information about the processor we're running on (the "model name" + from /proc/cpuinfo) + """ + + global cpu_type + if cpu_type == None: + read_cpu_info() + if cpu_type == None: + raise Exception("Couldn't find 'model name' line in /proc/cpuinfo") + return cpu_type + +def get_exp_ports(): + """ + Return a list containing all of the switch egress port numbers used by + nodes in the current CloudLab experiment. + """ + + ports = [] + for name in get_node_names(): + match = re.match('[^0-9]+([0-9]+)', name) + if not match: + raise Exception("bad host name %s: expected number at end" % (name)) + ports.append(int(match.group(1)) % 40) + return ports def get_interfaces(): """ @@ -76,22 +141,19 @@ def get_interfaces(): if interface: return [interface, vlan] available = "" - for line in subprocess.run(["ifconfig"], stdout=subprocess.PIPE, - encoding="utf-8", check=True).stdout.splitlines(): + for line in exec_cmd(["ifconfig"], check=True).stdout.splitlines(): match = re.match('^([a-z0-9]*):', line) if match: current = match.group(1) if available: available += ", " available += current - match = re.match('^([a-z]*1f1[a-z0-9]*):', line) - if match: - interface = match.group(1) - match = re.match('^([a-z]*1f0[a-z0-9]*):', line) - if match: - interface = match.group(1) + if (('s1f1' in current) or ('s1f0' in current) + or ('s0f0' in current) or ('s0f1' in current) + or (current == 'eno1')) and not interface: + interface = current continue - if re.match('^[ ]+ inet 10\.0\.1\.', line): + if re.match(r'^[ ]+ inet 10\.0\.1\.', line): vlan = current if not vlan or not interface: print("Found the following interfaces: %s" % (available)) @@ -102,6 +164,66 @@ def get_interfaces(): print("Primary network interface is %s, vlan is %s" % (interface, vlan)) return [interface, vlan] +def get_link_speed(): + """ + Return the link speed for the primary NIC, in Mbits/sec. + """ + + global link_mbps + if link_mbps != None: + return link_mbps + nic = get_interfaces()[0] + num_channels = -1 + + for line in exec_cmd(["ethtool", nic], check=True).stdout.splitlines(): + match = re.match('.*Speed: ([0-9]+)Mb/s', line) + if match: + link_mbps = int(match.group(1)) + print("Link speed for %s is %d Mbps" % (nic, link_mbps)) + return link_mbps + raise Exception("Couldn't find link speed in ethtool output") + +def get_nic_irqs(): + """ + Returns a list containing the IRQ numbers for NIC channels (entry + 0 in the list corresponds to channel 0, and so on). + """ + irqs = {} + f = open("/proc/interrupts") + for line in f: + match = re.match('([0-9]+): .* mlx5_comp([0-9]+)@pci:0000:03:00.1', line) + if not match: + continue + irq = int(match.group(1)) + channel = int(match.group(2)) + irqs[channel] = irq + result = [] + for i in range(len(irqs)): + if not i in irqs: + raise Exception('Couldn\'t find IRQ for NIC channel %d' % (i)) + result.append(irqs[i]) + f.close() + return result + +def get_nic_type(): + """ + Returns the type of driver for the primary NIC, such as "ice" for + the Intel driver. 
+ """ + + global nic_type + if nic_type != None: + return nic_type + info = exec_cmd(["sudo", "ethtool", "-i", get_interfaces()[0]], + check=True).stdout + match = re.search(r'.*driver:\s*(\S+)', info, re.MULTILINE) + if not match: + raise Exception("Couldn't identify NIC type (no 'driver' info in " + "ethtool output") + nic_type = match.group(1) + print("Driver type for primary network interface is %s" % (nic_type)) + return nic_type + def get_node_num(): """ Returns the (integer) number of this node (e.g., 1 for "node1"). @@ -110,9 +232,8 @@ def get_node_num(): global node_num if node_num != None: return node_num - hostname = subprocess.run(["hostname"], stdout=subprocess.PIPE, - encoding="utf-8", check=True).stdout - match = re.match('node([0-9]+)\.', hostname) + hostname = exec_cmd(["hostname"], check=True).stdout + match = re.match(r'node([0-9]+)\.', hostname) if not match: raise Exception("Couldn't figure out node number for this node") node_num = int(match.group(1)) @@ -121,7 +242,8 @@ def get_node_num(): def get_node_type(): """ - Returns the node type for this machine. + Returns the node type for this machine (assumes we're running on a + Cloudlab machine). """ global node_type @@ -133,12 +255,108 @@ def get_node_type(): print("Type of this node is %s" % (node_type)) return node_type +def get_node_names(): + """ + Return a list, in order of node number (node0 first), containing the + name of each node in the experiment, such as hp083. + """ + global node_names + if len(node_names) > 0: + return node_names + node_map = {} + f = open("/var/emulab/boot/ltpmap") + for line in f: + match = re.search('H node([0-9]+) ([^ ]+)', line) + if match: + num = int(match.group(1)) + node_map[int(match.group(1))] = match.group(2) + f.close() + if len(node_map) != get_num_nodes(): + raise Exception("get_node_names found only %d nodes, but expected %d" + % (len(node_map), get_num_nodes())) + node_names = [] + for i in range(len(node_map)): + if not i in node_map: + raise Exception("Couldn't find host name for node%d" % (i)) + node_names.append(node_map[i]) + return node_names + +def get_num_nodes(): + """ + Returns the number of nodes in this experiment. + """ + + global num_nodes + if num_nodes > 0: + return num_nodes + f = open("/var/emulab/boot/hostmap") + for line in f: + if re.match('node([-0-9.]+)', line): + num_nodes += 1 + f.close() + return num_nodes + +def get_qdisc_config(): + """ + Returns a dictionary with the following keys, which describe the current + configuration of queuing disciplines for the current interface: + root_handle: The handle (e.g. ":0" of the root qdisc for the interface) + children: A list of dictionaries describing the queue-specific qdiscs: + type: The type of the qdisc (e.g. fg_codel or homa) + handle: The handle for that qdisc + queue: The queue number that the qdisc is associated with + (e.g. 
+                 (e.g. if the parent handle is "1:4", the queue
+                 will be 4)
+    """
+
+    result = {}
+    result['children'] = []
+    nic = get_interfaces()[0]
+    for line in exec_cmd(['tc', 'qdisc', 'show', 'dev', nic],
+            check=True).stdout.splitlines():
+        match = re.match('qdisc mq ([0-9a-f]+:[0-9a-f]*) root', line)
+        if match:
+            result['root_handle'] = match.group(1)
+        match = re.match('qdisc ([^ ]+) ([0-9a-f]+:[0-9a-f]*) '
+                'parent [^:]*:([0-9a-f]+)', line)
+        if match:
+            type = match.group(1)
+            handle = match.group(2)
+            queue = int(match.group(3), 16)
+            result['children'].append({'type': type, 'handle': handle,
+                    'queue': queue})
+    return result
+
+def print_rss():
+    """
+    Print out the current RSS configuration (how many channels and which
+    cores will handle them).
+    """
+
+    nic = get_interfaces()[0]
+    irqs = get_nic_irqs()
+    num_channels = -1
+    for line in exec_cmd(["ethtool", "-l", nic],
+            check=True).stdout.splitlines():
+        match = re.match('Combined:[^0-9]+([0-9]+)', line)
+        if match:
+            num_channels = int(match.group(1))
+    if num_channels <= 0:
+        raise Exception('Couldn\'t get number of channels from ethtool')
+    print('Number of NIC channels: %d' % (num_channels))
+    print('Channel  Core')
+    for i in range(num_channels):
+        f = open('/proc/irq/%s/smp_affinity_list' % (irqs[i]))
+        core = int(f.read())
+        f.close()
+        print('%-4d     %4d' % (i, core))
+
 def read_cpu_info():
     """
     Read the file /proc/cpuinfo and store information from it in various
     global arrays above.
     """
-    global cpu_info, num_phys_cores
+    global cpu_info, cpu_type, num_phys_cores
 
     if len(cpu_info) > 0:
         return
@@ -147,11 +365,14 @@
     sockets = {}
     f = open("/proc/cpuinfo", "r")
     for line in f:
-        match = re.match('([^\t]*)[\t ]+: (.*)', line)
+        match = re.match(r'([^\t]*)[\t ]+: (.*)', line)
         if match:
             name = match.group(1)
            value = match.group(2)
             # print("name '%s' value '%s'" % (name, value))
+            if name == 'model name' and cpu_type == None:
+                cpu_type = value
+                print("CPU type is %s" % (cpu_type))
             if name == 'processor':
                 cpu = int(value)
                 cpu_info.append({name: int(value)})
@@ -197,19 +418,52 @@ def add_ipv6_to_etc_hosts(num_hosts):
         input = ""
         for i in range(first, num_hosts):
             input += "fd00::%d node%d\n" % (i+1, i)
-        subprocess.run(["sudo", "bash", "-c", "cat >> /etc/hosts"],
+        exec_cmd(["sudo", "bash", "-c", "cat >> /etc/hosts"],
                 input=input, encoding="utf-8", check=True)
     else:
         print("/etc/hosts already contains IPv6 addresses for nodes 0-%d" % (
                 num_hosts-1))
 
+def exec_cmd(*args, **kwargs):
+    """
+    This function is a wrapper around subprocess.run, which arranges for the
+    output to be captured and also performs logging. The arguments are the
+    same as those for subprocess.run.
+    """
+    global verbose
+
+    if verbose:
+        print("%% %s" % (" ".join(args[0])))
+    try:
+        return subprocess.run(*args, text=True, capture_output=True, **kwargs)
+    except subprocess.CalledProcessError as e:
+        if not verbose:
+            print("%% %s" % (" ".join(args[0])))
+        print(e.stderr)
+        raise
+
 def set_sysctl(name, value):
     """
     Set a Homa sysctl configuration option as given by name and value.
     """
 
-    subprocess.run(["sudo", "sysctl", ".net.homa.%s=%s" % (name, value)],
+    global sysctl_avl
+
+    if not sysctl_avl:
+        return
+    exec_cmd(["sudo", "sysctl", ".net.homa.%s=%s" % (name, value)],
             check=True)
 
+def config_ecn_threshold(kb):
+    """
+    Set the ECN marking threshold for this experiment's egress ports at
+    the top-of-rack switch.
+    kb: New threshold, in Kbytes
+ """ + s = Switch(True) + for port in get_exp_ports(): + print("Configuring ECN threshold for port %d" % (port)) + s.set_ecn_threshold(port, kb) + s.close() + def config_homa(mod): """ Install the Homa kernel driver and configure it appropriately for @@ -218,28 +472,45 @@ def config_homa(mod): """ type = get_node_type() print("Installing Homa kernel module from %s" % (mod)) - subprocess.run(["sudo", "rmmod", "homa"], check=False) - subprocess.run(["sudo", "bash", "-c", "insmod %s" % (mod)], + exec_cmd(["sudo", "rmmod", "homa"], check=False) + exec_cmd(["sudo", "bash", "-c", "insmod %s" % (mod)], check=True) + # See if Homa supports sysctls (if it has been stripped down for Linux + # upstreaming, it might not). + + result = exec_cmd(["sysctl", ".net.homa.num_priorities"], check=False) + if result.returncode != 0: + global sysctl_avl + print("Homa doesn't appear to support sysctls") + sysctl_avl = False + set_sysctl("num_priorities", "8") - if type == "xl170": - set_sysctl("link_mbps", "25000") - set_sysctl("max_nic_queue_ns", "2000") - set_sysctl("rtt_bytes", "150000") + link_mbps = get_link_speed() + set_sysctl ("link_mbps", str(link_mbps)) + set_sysctl("max_nic_est_backlog_usecs", "5") + if link_mbps == 10000: + set_sysctl("unsched_bytes", "30000") + set_sysctl("window", "50000") + set_sysctl("max_incoming", "400000") set_sysctl("max_gso_size", "10000") - elif type == "c6525-100g": - set_sysctl("link_mbps", "100000") - set_sysctl("max_nic_queue_ns", "5000") - set_sysctl("rtt_bytes", "200000") - set_sysctl("max_gso_size", "100000") - elif type == "c6525-25g": - set_sysctl("link_mbps", "25000") - set_sysctl("max_nic_queue_ns", "5000") - set_sysctl("rtt_bytes", "150000") + elif link_mbps == 25000: + set_sysctl("unsched_bytes", "60000") + set_sysctl("window", "100000") + set_sysctl("max_incoming", "480000") set_sysctl("max_gso_size", "10000") + elif link_mbps == 100000: + set_sysctl("unsched_bytes", "60000") + set_sysctl("window", "200000") + set_sysctl("max_incoming", "1600000") + set_sysctl("max_gso_size", "100000") else: - raise Exception("Can't configure Homa: unknown node type %s" % (type)) + raise Exception("Can't configure Homa: no config info available " + "for link speed %d Mbps" % (link_mbps)) + + if get_nic_type() == "ice": + print("Enabling TCP hijacking") + set_sysctl("hijack_tcp", 1) def config_ipv6(num_hosts, vlan): """ @@ -250,25 +521,92 @@ def config_ipv6(num_hosts, vlan): """ vlan = get_interfaces()[1] # Configure ifconfig and route if not already done. - if "inet6 fd00::" in subprocess.run(["ifconfig", vlan], - stdout=subprocess.PIPE, encoding="utf-8", check=True).stdout: + if "inet6 fd00::" in exec_cmd(["ifconfig", vlan], check=True).stdout: print("IPv6 already configured") else: print("Configuring IPv6:") - subprocess.run(["sudo", "ifconfig", vlan, "add", "fd00::%d/64" % ( + exec_cmd(["sudo", "ifconfig", vlan, "add", "fd00::%d/64" % ( get_node_num() + 1)], check=True) - subprocess.run(["sudo", "route", "-6", "add", "fd00::/16", vlan], + exec_cmd(["sudo", "route", "-6", "add", "fd00::/16", vlan], check=True) add_ipv6_to_etc_hosts(num_hosts) +def config_lb(config): + """ + Configures load balancing as specified by config. This includes RSS + (Receive-Side Scaling) and SoftIRQ steering by setting the number of + NIC channels and which cores will handle each. It will also configure + Homa's mechanism for choosing SoftIRQ cores as described in gen3.txt, + and it will set related sysctl parameters such as gro_policy and + gro_busy_usecs. 
+
+    config: Indicates which configuration is desired; one of:
+            xl170_default: Same as Gen2, except with the default boot-time
+                      assignment of network channels to cores
+            gen2:     For Homa Gen2 load balancing: one channel/core
+            gen3:     For Homa Gen3 load balancing: one channel/4 cores
+            gen3_alt: Gen3 with one channel/2 cores
+    """
+
+    read_cpu_info()
+
+    if config == 'xl170_default':
+        cores = xl170_default_cores
+        set_sysctl('gro_policy', '114')
+        set_sysctl('gro_busy_usecs', '0')
+    elif config == 'gen2':
+        cores = range(len(cpu_info))
+        set_sysctl('gro_policy', '114')
+        set_sysctl('gro_busy_usecs', '0')
+    elif config == 'gen3':
+        cores = [2*i for i in range(len(cpu_info)//4)]
+        set_sysctl('gro_policy', '226')
+        set_sysctl('gro_busy_usecs', '2')
+    elif config == 'gen3_alt':
+        cores = range(len(cpu_info)//2)
+        set_sysctl('gro_policy', '226')
+        set_sysctl('gro_busy_usecs', '2')
+    else:
+        raise Exception('Bad load balancing config "%s"; must be '
+                'xl170_default, gen2, gen3, or gen3_alt' % (config))
+
+    exec_cmd(["sudo", "ethtool", "-L", get_interfaces()[0], "combined",
+            str(len(cores))], check=True)
+
+    irqs = get_nic_irqs()
+    half_cores = len(cpu_info)//2
+    for i in range(len(cores)):
+        softirq_cores = []
+        core = cores[i]
+        if config == 'gen3':
+            # Use every 4th core for GRO, with 3 choices for SoftIRQ
+            softirq_cores = [2*i + 1, half_cores + 2*i, half_cores + 2*i + 1]
+        elif config == 'gen3_alt':
+            # Use every other core for GRO, with only 1 choice for SoftIRQ
+            softirq_cores = [(half_cores + (core+1)%half_cores), -1, -1]
+        softirq_msg = ''
+        if softirq_cores:
+            softirq_msg = ", SoftIRQ on cores %s" % (softirq_cores)
+            exec_cmd(["sudo", "sysctl",
+                    ".net.homa.gen3_softirq_cores=%d %d %d %d" % (cores[i],
+                    softirq_cores[0], softirq_cores[1], softirq_cores[2])],
+                    check=True)
+
+        exec_cmd(["sudo", "bash", "-c",
+                "echo %d > /proc/irq/%s/smp_affinity_list" % (cores[i], irqs[i])],
+                check=True)
+
+        print("Configured RSS for channel %d (IRQ %d): NAPI on core %d%s"
+                % (i, irqs[i], cores[i], softirq_msg))
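+
+# For reference (illustrative numbers, not computed by this script): with
+# 20 cores, the 'gen3' case above yields GRO cores [0, 2, 4, 6, 8], and
+# channel i is given SoftIRQ candidates [2*i + 1, 10 + 2*i, 11 + 2*i];
+# e.g. channel 0 does NAPI/GRO on core 0 with SoftIRQ on core 1, 10, or 11.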
 
 def config_mtu(size):
     """
     Set the maximum allowable packet length for this node to size.
     """
 
     [interface, vlan] = get_interfaces()
-    subprocess.run(["sudo", "ip", "link", "set", interface, "mtu", str(size)],
+    exec_cmd(["sudo", "ip", "link", "set", interface, "mtu", str(size)],
             check=True)
-    subprocess.run(["sudo", "ip", "link", "set", vlan, "mtu", str(size)],
+    exec_cmd(["sudo", "ip", "link", "set", vlan, "mtu", str(size)],
             check=True)
     print("MTU set to %d bytes" % (size))
 
@@ -278,30 +616,109 @@ def config_nic():
     """
     mechanisms).
     """
 
     interface = get_interfaces()[0]
-    subprocess.run(["sudo", "ethtool", "-C", interface, "adaptive-rx", "off"],
+
+    # Use a separate ethtool command for each parameter. Otherwise,
+    # if one parameter isn't supported the command will be aborted,
+    # so no parameters will get set.
+    print("Configuring NIC to reduce interrupt latency")
+    exec_cmd(["sudo", "ethtool", "-C", interface, "adaptive-rx", "off"],
             check=False)
-    subprocess.run(["sudo", "ethtool", "-C", interface, "rx-usecs", "5",
-            "rx-frames", "1"], check=False)
+    exec_cmd(["sudo", "ethtool", "-C", interface, "rx-usecs", "0"], check=False)
+    exec_cmd(["sudo", "ethtool", "-C", interface, "rx-frames", "1"], check=False)
+
+    if get_nic_type() == "ice":
+        print("Increasing tx ring size for Intel NIC")
+        exec_cmd(["sudo", "ethtool", "-G", interface, "tx", "1024"],
+                check=True)
+        print("Disabling adaptive-tx for Intel NIC to recover tx buffers faster")
+        exec_cmd(["sudo", "ethtool", "-C", interface, "adaptive-tx", "off",
+                "tx-usecs", "10"], check=True)
 
 def config_power():
     """
     Configure the machine's power management for best Homa performance.
     """
 
-    type = get_node_type()
-    if type == "xl170":
-        # Intel E5-2640v4 processor. For Homa, it's best to leave C-states
-        # enabled. This can cause CPUs to sleep in power-saving mode, but if
-        # C-states are disabled, then so is Turbo mode, and that will hurt
-        # peak peformance.
+    if "intel" in get_cpu_type().lower():
+        # For Intel processors, it's best to leave C-states enabled. This
+        # can cause CPUs to sleep in power-saving mode, but if C-states
+        # are disabled, then so is Turbo mode, and that will hurt peak
+        # performance.
         print("Configuring power settings for Intel CPUs")
-        subprocess.run(["sudo", "cpupower", "frequency-set", "-g",
-                "performance"], check=True)
-    elif (type == "c6525-100g") or (type == "c6525-25g"):
-        # AMD 7402P (EPYC Rome processor); don't know of any appropriate
-        # power setting changes
-        return
+        try:
+            exec_cmd(["sudo", "cpupower", "frequency-set", "-g",
+                    "performance"], check=True)
+        except subprocess.CalledProcessError:
+            print("*** cpupower error; ignoring for now")
     else:
-        raise Exception("Can't configure power: unknown node type %s" % (type))
+        print("Skipping power settings (non-Intel CPU type)")
+
+def config_qdisc():
+    """
+    Install Homa's queuing discipline for all of the queues of the NIC.
+    This will only work if the Homa version of tc is in the search path.
+    """
+
+    nic = get_interfaces()[0]
+    config = get_qdisc_config()
+    root = config['root_handle']
+
+    print('Installing Homa qdisc')
+    if root == '0:':
+        # Must reset the root qdisc (it isn't possible to modify the
+        # default one)
+        exec_cmd(['sudo', 'tc', 'qdisc', 'add', 'dev', nic, 'root', 'handle',
+                '1:', 'mq'], check=True)
+        root = '1:'
+
+    # Replace the qdisc for each NIC queue with a homa one.
+    for child in config['children']:
+        if child['handle'] != '0:':
+            exec_cmd(['sudo', 'tc', 'qdisc', 'del', 'dev', nic,
+                    'parent', '%s%x' % (root, child['queue']), 'handle',
+                    child['handle']], check=True)
+        exec_cmd(['sudo', 'tc', 'qdisc', 'add', 'dev', nic,
+                'parent', '%s%x' % (root, child['queue']), 'handle',
+                '%x:' % (64 + child['queue']), 'homa'], check=True)
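+
+# For example (illustrative): on a NIC with two tx queues and root handle
+# "1:", the loop above issues commands equivalent to:
+#     tc qdisc add dev <nic> parent 1:1 handle 41: homa
+#     tc qdisc add dev <nic> parent 1:2 handle 42: homa
+# (64 + queue is rendered in hex, so queue 1 maps to handle "41:").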
+ """ + + nic = get_interfaces()[0] + config = get_qdisc_config() + root = config['root_handle'] + + print('Removing Homa qdisc, restoring fq_codel') + for child in config['children']: + if child['type'] != 'homa': + continue + exec_cmd(['sudo', 'tc', 'qdisc', 'del', 'dev', nic, + 'parent', '%s%x' % (root, child['queue']), 'handle', + '%x:' % (64 + child['queue'])], check=True) + exec_cmd(['sudo', 'tc', 'qdisc', 'replace', 'dev', nic, + 'parent', '%s%x' % (root, child['queue']), 'handle', + '%x:' % (64 + child['queue']), 'fq_codel'], check=True) + +def config_reset_switch_all_ports(): + """ + Reset the configuration of all egress ports at the top-of-rack switch + to undo a previous config_all_switch_ports. + """ + s = Switch(True) + s.reset_all_ports() + s.close() + +def config_reset_switch_ports(): + """ + Reset the configuration of the egress ports at the top-of-rack switch + to undo a previous config_switch_ports. + """ + s = Switch(True) + for port in get_exp_ports(): + print("Resetting TOR port for port %d:" % (port)) + s.reset_port(port) + s.close() def config_rps(): """ @@ -310,39 +727,102 @@ def config_rps(): interface = get_interfaces()[0] mask = get_core_mask() - subprocess.run(["sudo", "sysctl", "-w", + exec_cmd(["sudo", "sysctl", "-w", "net.core.rps_sock_flow_entries=32768"], check=True) flow_cnt = 0 for file in glob("/sys/class/net/%s/queues/rx-*/rps_flow_cnt" % (interface)): - subprocess.run(["sudo", "bash", "-c", "echo 2048 > %s" % (file)], + exec_cmd(["sudo", "bash", "-c", "echo 2048 > %s" % (file)], check=True) flow_cnt += 1 cpus = 0 for file in glob("/sys/class/net/%s/queues/rx-*/rps_cpus" % (interface)): - subprocess.run(["sudo", "bash", "-c", "echo %s > %s" % (mask, file)], + exec_cmd(["sudo", "bash", "-c", "echo %s > %s" % (mask, file)], check=True) cpus += 1 print("Configured RPS and RFS: %d rps_flow_cnt files and %d rps_cpus files" % (flow_cnt, cpus)) +def config_switch_buffer_limit(mbytes): + """ + Restrict the total amount of buffer space available for egress ports + in the top-of-rack switch. + mbytes: New limit, in Mbytes + """ + s = Switch(True) + s.set_buffer_limit(mbytes) + s.close() + +def config_switch_all_ports(): + """ + Modify the configuration of all egress ports at the top-of-rack switch + to enable optimal Homa performance. + """ + s = Switch(True) + s.config_all_ports() + s.close() + +def config_switch_ports(): + """ + Modify the configuration of the egress ports at the top-of-rack switch + to enable optimal Homa performance. 
+ """ + s = Switch(True) + for port in get_exp_ports(): + print("Configuring TOR port for port %d:" % (port)) + s.config_port(port) + s.close() + def print_help(): print("Performs any of several configuration tasks to prepare a node for") print("running Homa applications.") print("Usage: config feature feature ...") print("\nEach feature may be one of the following:") - print(" --help Print this help text and exit") - print(" default Normal configuration for Homa: equivalent to") - print(" 'homa ~/bin/homa.ko ipv6 nic power rps'") - print(" homa HHH Install and configure the Homa kernel driver;") - print(" HHH is the path to the homa.ko file") - print(" ipv6 Set up routing information so that IPv6") - print(" addresses can be used.") - print(" mtu NNN Set the maximum packet length to NNN") - print(" nic Configure the NIC for optimal Homa performance") - print(" power Configure power management (e.g., C-states)") - print(" for best Homa performance") - print(" rps Set up (and enable) RPS and RFS") + print(" --help Print this help text and exit") + print(" --verbose Print details of commands executed and results") + print(" default Normal configuration for Homa: equivalent to") + print(" 'reset_qdisc homa ~/bin/homa.ko ipv6 nic power") + print(" rps'") + print(" ecn_threshold KB Set the ECN marking threshold for all ports in") + print(" the experiment to KB (Kbytes)") + print(" homa HHH Install and configure the Homa kernel driver;") + print(" HHH is the path to the homa.ko file") + print(" ipv6 Set up routing information so that IPv6") + print(" addresses can be used.") + print(" lb CONFIG Configure load balancing, including RSS, SoftIRQ") + print(" core selection, and various sysctl parameters,") + print(" as indicated by CONFIG (xl170_default, gen2,") + print(" gen3, or gen3_alt)") + print(" mtu NNN Set the maximum packet length to NNN") + print(" node_names Print out the names of all the nodes in ") + print(" the experiment") + print(" nic Configure the NIC for optimal Homa performance") + print(" power Configure power management (e.g., C-states)") + print(" for best Homa performance") + print(" print_rss Print out current RSS configuration") + print(" qdisc Install Homa's queuing discipline for all") + print(" of the NIC tx queues") + print(" reset_qdisc Uninstall Homa's queuing discipline for any") + print(" NIC queues where it is installed, and install") + print(" fq_codel in its place") + print(" reset_switch_all_ports Issue commands to TOR switch to restore original") + print(" port settings for all ports on the switch") + print(" (even those not used by current experiment)") + print(" reset_switch_ports Issue commands to TOR switch to restore") + print(" original port settings (only restores ports") + print(" used by the current experiment)") + print(" rps Set up (and enable) RPS and RFS") + print(" switch_buffer_limit MB Set max egress buffer space in TOR switch to") + print(" MB Mbytes") + print(" switch_all_ports Issue commands to TOR switch to configure ") + print(" egress ports for all ports on the switch") + print(" (even those not used by current experiment)") + print(" switch_ports Issue commands to TOR switch to configure ") + print(" egress ports for Homa (only configures ports") + print(" used by the current experiment)") + +# Force print to always flush (turn off line buffering). 
+print = functools.partial(print, flush=True)
 
 i = 1
 while i < len(sys.argv):
@@ -351,12 +831,22 @@
     arg = sys.argv[i]
     if arg == "--help":
         print_help()
        exit(0)
+    elif arg == "--verbose":
+        verbose = True
     elif arg == "default":
+        config_reset_qdisc()
         config_homa("~/bin/homa.ko")
-        config_ipv6(10, vlan)
+        config_ipv6(get_num_nodes(), vlan)
         config_nic()
         config_power()
         config_rps()
+    elif arg == "ecn_threshold":
+        if i >= len(sys.argv):
+            raise Exception("No argument provided for 'ecn_threshold' command");
+            exit(1)
+        kb = int(sys.argv[i])
+        i += 1
+        config_ecn_threshold(kb)
     elif arg == "homa":
         if i >= len(sys.argv):
             raise Exception("No argument provided for 'homa' command");
@@ -365,7 +855,13 @@
         mod = sys.argv[i]
         i += 1
         config_homa(mod)
     elif arg == "ipv6":
-        config_ipv6(10, vlan)
+        config_ipv6(get_num_nodes(), vlan)
+    elif arg == "lb":
+        if i >= len(sys.argv):
+            raise Exception("No argument provided for 'lb' command");
+            exit(1)
+        config_lb(sys.argv[i])
+        i += 1
     elif arg == "mtu":
         if i >= len(sys.argv):
             raise Exception("No argument provided for 'mtu' command");
@@ -375,9 +871,34 @@
         mtu = int(sys.argv[i])
         i += 1
         config_mtu(mtu)
     elif arg == "nic":
         config_nic()
+    elif arg == "node_names":
+        for name in get_node_names():
+            print(name)
     elif arg == "power":
         config_power()
+    elif arg == "print_rss":
+        print_rss()
+    elif arg == "qdisc":
+        config_qdisc()
+    elif arg == "reset_qdisc":
+        config_reset_qdisc()
+    elif arg == "reset_switch_all_ports":
+        config_reset_switch_all_ports()
+    elif arg == "reset_switch_ports":
+        config_reset_switch_ports()
     elif arg == "rps":
         config_rps()
+    elif arg == "switch_buffer_limit":
+        if i >= len(sys.argv):
+            raise Exception("No argument provided for 'switch_buffer_limit' command");
+            exit(1)
+        config_switch_buffer_limit(float(sys.argv[i]))
+        i += 1
+    elif arg == "switch_all_ports":
+        config_switch_all_ports()
+    elif arg == "switch_ports":
+        config_switch_ports()
+    elif arg == "test":
+        get_cpu_type()
     else:
         raise Exception("Unknown feature '%s'" % (arg))
diff --git a/cloudlab/bin/install b/cloudlab/bin/install
deleted file mode 100755
index 8765cfb7..00000000
--- a/cloudlab/bin/install
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2020-2023 Stanford University
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE
-
-# This script installs all of the files needed to run Homa tests on one
-# or more target machines; it also loads the Homa kernel module.
-#
-# Usage:
-# install num_nodes [first]
-#
-# The "num_nodes" arguments indicates how many servers should be updated.
-# The "first" argument is optional; it is an integer identifying the
-# first node on which installation will occur (e.g. "install 4 2" means
-# node2 through node5 will be updated. "first" defaults to 0.
-# This script assumes that Homa has been built in ~/homaModule on the
-# current machine (this includes both homa.ko and all of the binaries in util).
-
-root=~/homaModule
-
-set -e
-if [ $# -eq 2 ]; then
-    first=$2
-elif [ $# -eq 1 ]; then
-    first=0
-else
-    echo "Usage: install num_nodes [first]"
-    exit 1
-fi
-last=`expr $first + $1 - 1` || true
-
-for ((i = $first ; i <= $last; i++)); do
-    node=node$i
-    echo
-    echo '*** Installing on' $node '***'
-    rsync -e "ssh -o StrictHostKeyChecking=no" -rtv ~/.bashrc ~/.bash_profile ~/.gdbinit $node:
-    rsync -e "ssh -o StrictHostKeyChecking=no" -rtv ~/bin/ $node:bin/
-    rsync -e "ssh -o StrictHostKeyChecking=no" -rtv $root/homa.ko $root/util/cp_node $root/util/homa_prio $root/util/*.py $node:bin/
-    ssh $node 'echo $PATH'
-    ssh $node 'config default'
-done
\ No newline at end of file
diff --git a/cloudlab/bin/install_homa b/cloudlab/bin/install_homa
new file mode 100755
index 00000000..6749cda2
--- /dev/null
+++ b/cloudlab/bin/install_homa
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Copyright (c) 2020-2023 Homa Developers
+# SPDX-License-Identifier: BSD-1-Clause
+
+# This script installs all of the files needed to run Homa tests on one
+# or more target machines; it also loads the Homa kernel module.
+#
+# Usage:
+# install_homa [--net-next] [--verbose] num_nodes [first]
+#
+# The "num_nodes" argument indicates how many servers should be updated.
+# The "first" argument is optional; it is an integer identifying the
+# first node on which installation will occur (e.g. "install_homa 4 2" means
+# node2 through node5 will be updated). "first" defaults to 0.
+# This script assumes that the Homa module binary (homa.ko) has already
+# been built. If --net-next is specified, it will be in the kernel build
+# directory (see code below for path), otherwise it will be in ~/homaModule.
+# In addition, the utility programs in ~/homaModule/util must have been built.
+
+root=~/homaModule
+
+set -e
+
+homa_ko=$root/homa.ko
+verbose=""
+rsync_switches="-rt"
+while true; do
+    if [ "$1" = "--net-next" ]; then
+        homa_ko=/netnext/net-next/net/homa/homa.ko
+        shift
+    elif [ "$1" = "--verbose" ]; then
+        verbose=" --verbose"
+        rsync_switches="-rtv"
+        shift
+    else
+        break
+    fi
+done
+if [ $# -eq 2 ]; then
+    first=$2
+elif [ $# -eq 1 ]; then
+    first=0
+else
+    echo "Usage: install_homa [--net-next] [--verbose] num_nodes [first]"
+    exit 1
+fi
+last=`expr $first + $1 - 1` || true
+
+for ((i = $first ; i <= $last; i++)); do
+    node=node$i
+    echo
+    echo '*** Installing Homa on' $node '***'
+    rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" $rsync_switches ~/.bashrc ~/.bash_profile ~/.gdbinit $node:
+    rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" $rsync_switches --exclude __pycache__ ~/bin/ $node:bin/
+    rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" $rsync_switches $homa_ko $root/util/cp_node $root/util/homa_prio $root/util/server $root/util/homa_test $root/util/*.py $node:bin/
+    ssh -4 $node 'sudo sysctl .kernel.printk="5 4 1 7" > /dev/null'
+    # ssh -4 $node 'echo $PATH'
+    ssh -4 $node "config$verbose default"
+    ssh -4 $node 'if ! grep -q mitigations=off /proc/cmdline; then echo WARNING: Meltdown/Spectre mitigations have not been disabled!; fi'
+done
\ No newline at end of file
diff --git a/cloudlab/bin/loop b/cloudlab/bin/loop
deleted file mode 100755
index 4ea900a6..00000000
--- a/cloudlab/bin/loop
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2020, Stanford University
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE
-
-# Run a cluster slowdown test in an infinite loop
-
-while true; do ~ouster/homaModule/util/cperf --nodes 10 -w w4 -l ~ouster/logs/w4 -b 2 -s 120 slowdown; done
\ No newline at end of file
diff --git a/cloudlab/bin/on_nodes b/cloudlab/bin/on_nodes
index 7d133ad7..a13763d9 100755
--- a/cloudlab/bin/on_nodes
+++ b/cloudlab/bin/on_nodes
@@ -1,41 +1,29 @@
 #!/bin/bash
 
-# Copyright (c) 2020-2023 Stanford University
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE
+# Copyright (c) 2020-2023 Homa Developers
+# SPDX-License-Identifier: BSD-1-Clause
 
 # This uses ssh to run a given command on one or more nodes in the
 # cluster.
 #
 # Usage:
-# on_nodes num_nodes cmd arg arg ...
+# on_nodes first last cmd arg arg ...
 #
-# The "num_nodes" arguments indicates how many nodes the command should
-# be run on (starting at node0). The remaining arguments are a command
+# The "first" and "last" arguments give the range of nodes (inclusive) on
+# which the command should run. The remaining arguments are a command
 # and its arguments to run on the given machines
 
-root=~/homaModule
-
-if [ $# -lt 2 ]; then
-    echo "Usage: on_nodes num_nodes cmd arg arg ..."
+if [ $# -lt 3 ]; then
+    echo "Usage: on_nodes first last cmd arg arg ..."
+    exit 1
 fi
-last=`expr $1 - 1`
-shift
+first=$1
+last=$2
+shift 2

-for ((i = 0 ; i <= $last; i++)); do
+for ((i = $first ; i <= $last; i++)); do
     node=node$i
     echo ""
     echo $node:
-    ssh $node $@
+    ssh -4 $node "$@"
 done
\ No newline at end of file
diff --git a/cloudlab/bin/set_cutoffs b/cloudlab/bin/set_cutoffs
index f0291028..931167cd 100755
--- a/cloudlab/bin/set_cutoffs
+++ b/cloudlab/bin/set_cutoffs
@@ -1,18 +1,7 @@
 #!/bin/bash

-# Copyright (c) 2020-2023 Stanford University
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE
+# Copyright (c) 2020-2023 Homa Developers
+# SPDX-License-Identifier: BSD-1-Clause

 # This script sets the cutoffs for unscheduled priorities on one or more
 # nodes to match the characteristics of the Homa workloads.
diff --git a/cloudlab/bin/start_xl170 b/cloudlab/bin/start_xl170
deleted file mode 100755
index 0c052cd7..00000000
--- a/cloudlab/bin/start_xl170
+++ /dev/null
@@ -1,115 +0,0 @@
-# As of January 2023 this script is obsolete and deprecated. It is
-# retained (for now) a possibly useful historical reference
-
-# This script starts up the Homa module and configures it for the
-# local system. If Homa was previously installed, it is uninstalled.
-# Usage: -# start_xl170 [mod_file] - -set -e - -xl170=1 - -# Network interface name -ni=ens1f1np1 - -rps=1 -limit_rss=0 -for arg in $@; do - if [ $arg = no_rps ]; then - rps=0 - fi - done - -if [ $# -eq 1 ]; then - homa=$1 -else - homa=~/homaModule/homa.ko -fi - -sudo rmmod homa || true -sudo insmod $homa - -if [ $xl170 -eq 1 ] ; then - echo Configuring for xl170 - sudo sysctl /net/homa/link_mbps=25000 - sudo sysctl /net/homa/max_nic_queue_ns=2000 - sudo sysctl /net/homa/rtt_bytes=60000 - sudo cpupower frequency-set -g performance -else - echo Configuring for c6525-100g - sudo sysctl /net/homa/link_mbps=100000 - sudo sysctl /net/homa/max_nic_queue_ns=5000 - sudo sysctl /net/homa/rtt_bytes=200000 - sudo sysctl /net/homa/max_gso_size=100000 -fi - -sudo sysctl /net/homa/num_priorities=8 -# sudo sysctl /net/homa/pacer_fifo_fraction=0 -# sudo sysctl /net/homa/grant_fifo_fraction=0 -# sudo sysctl /net/homa/verbose=1 -sudo ethtool -C $ni adaptive-rx off rx-usecs 5 rx-frames 1 || true -# sudo sysctl .net.core.netdev_budget=300 -# sudo sysctl .net.ipv4.tcp_congestion_control=cubic - -# Clean metrics for metrics.py -rm -f ~/.homa_metrics - -# Turn on RPS and RFS -if [ $rps -eq 1 ]; then - echo Turning on RPS/RFS - sudo sysctl -w net.core.rps_sock_flow_entries=32768 - for f in /sys/class/net/$ni/queues/rx-*/rps_flow_cnt; do - sudo bash -c "echo 2048 > $f" - done - for f in /sys/class/net/$ni/queues/rx-*/rps_cpus; do - sudo bash -c "echo fffff > $f" - done - sudo ethtool -K $ni ntuple on -else - echo Turning off RPS/RFS - sudo sysctl -w net.core.rps_sock_flow_entries=0 - for f in /sys/class/net/$ni/queues/rx-*/rps_flow_cnt; do - sudo bash -c "echo 0 > $f" - done - for f in /sys/class/net/$ni/queues/rx-*/rps_cpus; do - sudo bash -c "echo 00000 > $f" - done - sudo ethtool -K $ni ntuple off -fi - -if false; then -# This code may be broken now, given new Linux version -echo "This code shouldn't run!!" -if [ $limit_rss -eq 1 ]; then - echo Limiting RSS channels - sudo bash -c "echo 16 > /proc/irq/117/smp_affinity_list" - sudo bash -c "echo 18 > /proc/irq/118/smp_affinity_list" - sudo ethtool -L $ni combined 2 -else - echo Using all RSS channels - # Note: the first line below corresponds to device - # mlx5_comp0@pci:0000:03:00.1 in the printout from 'cat /proc/interrupts'?? 
- sudo bash -c "echo 0 > /proc/irq/106/smp_affinity_list" - sudo bash -c "echo 1 > /proc/irq/107/smp_affinity_list" - sudo bash -c "echo 2 > /proc/irq/108/smp_affinity_list" - sudo bash -c "echo 3 > /proc/irq/109/smp_affinity_list" - sudo bash -c "echo 4 > /proc/irq/110/smp_affinity_list" - sudo bash -c "echo 5 > /proc/irq/111/smp_affinity_list" - sudo bash -c "echo 6 > /proc/irq/112/smp_affinity_list" - sudo bash -c "echo 7 > /proc/irq/113/smp_affinity_list" - sudo bash -c "echo 8 > /proc/irq/114/smp_affinity_list" - sudo bash -c "echo 9 > /proc/irq/115/smp_affinity_list" - sudo bash -c "echo 10 > /proc/irq/116/smp_affinity_list" - sudo bash -c "echo 11 > /proc/irq/117/smp_affinity_list" - sudo bash -c "echo 12 > /proc/irq/118/smp_affinity_list" - sudo bash -c "echo 13 > /proc/irq/119/smp_affinity_list" - sudo bash -c "echo 14 > /proc/irq/120/smp_affinity_list" - sudo bash -c "echo 15 > /proc/irq/121/smp_affinity_list" - sudo bash -c "echo 16 > /proc/irq/122/smp_affinity_list" - sudo bash -c "echo 17 > /proc/irq/123/smp_affinity_list" - sudo bash -c "echo 18 > /proc/irq/124/smp_affinity_list" - sudo bash -c "echo 19 > /proc/irq/125/smp_affinity_list" - sudo ethtool -L $ni combined 20 -fi -fi \ No newline at end of file diff --git a/cloudlab/bin/switch.py b/cloudlab/bin/switch.py new file mode 100755 index 00000000..777d7c2b --- /dev/null +++ b/cloudlab/bin/switch.py @@ -0,0 +1,191 @@ +#!/usr/bin/python3 + +# Copyright (c) 2023 Homa Developers +# SPDX-License-Identifier: BSD-1-Clause + +# This file defines the Switch class. + +import fcntl +import os +import re +import subprocess +import sys +import time + +# A Switch object represents a Mellanox top-of-rack switch for a CloudLab +# experiment, and it provides various operations on the switch such +# as configuring ports and querying statistics such as maximum buffer +# usage. +class Switch: + def __init__(self, verbose=False): + self.verbose = verbose + + # Open an ssh connection to the switch. + self.ssh = subprocess.Popen(["ssh", "-T", "-p", "51295", + "-o", "HostKeyAlgorithms=+ssh-rsa", + "-o", "PubkeyAcceptedKeyTypes=+ssh-rsa", "admin@localhost"], + encoding="utf-8", stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + fl = fcntl.fcntl(self.ssh.stdout, fcntl.F_GETFL) + fcntl.fcntl(self.ssh.stdout, fcntl.F_SETFL, fl | os.O_NONBLOCK) + + # Enter config mode + self.do_cmd("enable") + self.do_cmd("configure terminal") + + def close(self): + """ + Shut down the ssh connection to the switch; this object will become + unusable. + """ + self.ssh.terminate() + + def do_cmd(self, command, time_limit=5.0): + """ + Invoke a command on the switch and wait for it to complete. If a + long time goes by without a command prompt, an exception is thrown. + All of the output is returned. + command: Command to invoke; if empty, then no command is invoked + (but we wait for a prompt to appear). + time_limit: An error will be generated if this many seconds go by + without the appearance of a command prompt. + """ + if self.verbose: + print(command) + if len(command) > 0: + print(command, file=self.ssh.stdin) + # Because we're running ssh without a pseudo-tty (which is necessary, + # because otherwise ssh doesn't exit if this process exits), the + # switch doesn't generate any prompts. So, output a bogus command + # and use the error message from that command as an indication that + # the earlier command has completed. 
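+        # In other words, the bogus command acts as a sentinel: the switch
+        # reliably rejects it with an "Unrecognized command" error, and once
+        # that error shows up in the output stream we know that all of the
+        # output from the real command has already been received.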
+        print('xyzzy', file=self.ssh.stdin, flush=True)
+
+        output = ""
+        start_time = time.time()
+        while True:
+            if time.time() > (start_time + time_limit):
+                raise Exception("switch command '%s' didn't complete; output:\n%s"
+                        % (command.rstrip(), output))
+            data = self.ssh.stdout.read(1000)
+            if data != "":
+                output += data
+            if re.search('Unrecognized command.*xyzzy.*help', output,
+                    flags=re.DOTALL):
+                return output
+            time.sleep(0.1)
+
+    def get_max_buffer_usage(self):
+        """
+        Return the maximum total buffer usage (across all egress ports).
+        """
+        output = self.do_cmd("show buffers pools ePool0")
+        match = re.search(r'.*ePool0\s+egress.*[0-9.]+M?\s+[0-9.]+M?\s+([0-9.]+)([MK]?)',
+                output)
+        if match:
+            if match.group(2) == 'M':
+                return float(match.group(1))
+            elif match.group(2) == 'K':
+                return float(match.group(1))/1000.0
+            else:
+                return float(match.group(1))/1e06
+        raise Exception("Switch.get_max_buffer_usage couldn't find "
+                "information for ePool0; here is the output:\n%s" % (output))
+
+    def clear_max_buffer_usage(self):
+        """
+        Reset the maximum total buffer usage so that it will recompute
+        starting now.
+        """
+        self.do_cmd("clear buffers pool max-usage")
+
+    def config_port(self, port):
+        """
+        Configure the settings on a particular egress port to meet Homa's
+        needs.
+        port:     Index of the port to configure.
+        """
+
+        # Enable priority queues for Homa.
+        self.do_cmd("interface ethernet 1/%d qos trust both" % (port))
+        for tc in range(8):
+            self.do_cmd("interface ethernet 1/%d traffic-class %d dcb ets strict" %
+                    (port, tc))
+
+        # Enable large packets.
+        self.do_cmd("interface ethernet 1/%d mtu 9216 force" % (port))
+
+        # Set DCTCP marking thresholds.
+        self.do_cmd("interface ethernet 1/%d traffic-class 0 congestion-control ecn "
+                "minimum-absolute 70 maximum-absolute 70" % (port))
+        self.do_cmd("interface ethernet 1/%d traffic-class 1 congestion-control ecn "
+                "minimum-absolute 70 maximum-absolute 70" % (port))
+
+    def config_all_ports(self):
+        """
+        Invoke config_port on all of the egress ports for the switch.
+        """
+
+        for port in range(1, 41):
+            self.config_port(port)
+
+    def reset_port(self, port):
+        """
+        Restore default settings for a port (undo the effects of a previous
+        call to config_port).
+        """
+
+        # Restore QOS priorities.
+        self.do_cmd("interface ethernet 1/%d no qos trust" % (port))
+        for tc in range(8):
+            self.do_cmd("interface ethernet 1/%d traffic-class %d no dcb ets" %
+                    (port, tc))
+
+        # Disable large packets
+        self.do_cmd("interface ethernet 1/%d mtu 1500 force" % (port))
+
+        # Reset DCTCP marking thresholds:
+        self.do_cmd("interface ethernet 1/%d no traffic-class 0 congestion-control"
+                % (port))
+        self.do_cmd("interface ethernet 1/%d no traffic-class 1 congestion-control"
+                % (port))
+
+    def reset_all_ports(self):
+        """
+        Invoke reset_port on all of the egress ports for the switch.
+        """
+
+        for port in range(1, 41):
+            self.reset_port(port)
+
+    def set_buffer_limit(self, mbytes):
+        """
+        Configure the switch to limit the total amount of buffer space
+        in egress ports to a given amount.
+        mbytes:   Desired limit, in Mbytes
+        """
+        self.do_cmd("advance buffer management force")
+        self.do_cmd("pool ePool0 size %.3fM type dynamic" % (mbytes))
+
+    def set_ecn_threshold(self, port, kb):
+        """
+        Set the ECN marking threshold for a given port.
+
+        port:     The port to configure
+        kb:       Value to set for the marking threshold, in KB
+        """
+
+        self.do_cmd("interface ethernet 1/%d traffic-class 0 congestion-control ecn "
+                "minimum-absolute %d maximum-absolute %d" % (port, kb, kb))
+        self.do_cmd("interface ethernet 1/%d traffic-class 1 congestion-control ecn "
+                "minimum-absolute %d maximum-absolute %d" % (port, kb, kb))
+
+    def set_all_ecn_thresholds(self, kb):
+        """
+        Set ECN marking threshold for all of the ports in the switch.
+
+        kb:       Value to set for the marking threshold, in KB
+        """
+
+        for port in range(1, 41):
+            self.set_ecn_threshold(port, kb)
diff --git a/cloudlab/bin/update_linux b/cloudlab/bin/update_linux
index b2fff3f9..fe22b543 100755
--- a/cloudlab/bin/update_linux
+++ b/cloudlab/bin/update_linux
@@ -1,18 +1,7 @@
 #!/bin/bash

-# Copyright (c) 2020-2023 Stanford University
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE
+# Copyright (c) 2020-2023 Homa Developers
+# SPDX-License-Identifier: BSD-1-Clause

 # This script uses files on the current machine to update the kernel on one
 # or more other machines and reboot those machines.
@@ -26,8 +15,8 @@
 # (e.g. "update_linux 4 2" means node2 through node5 will be updated).
 # "first" defaults to 1.

-#v=`uname -r`
-v=5.17.7+
+v=`uname -r`
+#v=5.17.7+

 if [ $# -eq 2 ]; then
     first=$2
@@ -43,9 +32,12 @@
 for ((i = $first ; i <= $last; i++)); do
     node=node$i
     echo
     echo $node
-    ssh $node 'rm -rf tmp; mkdir -p tmp tmp/boot tmp/8021q'
-    rsync -rtv /boot/initrd.img-$v /boot/config-$v /boot/System.map-$v \
+
+    # Forcing IPv4 below helps on CloudLab nodes: otherwise there will be
+    # a 10-20 second delay for each node if Homa hasn't been installed
+    # (ssh apparently tries IPv6 first).
+    ssh -4 $node 'rm -rf tmp; mkdir -p tmp tmp/boot'
+    rsync --ipv4 -rtv /boot/initrd.img-$v /boot/config-$v /boot/System.map-$v \
         /boot/vmlinuz-$v $node:tmp/boot/
-    rsync -rtv /lib/modules/$v/kernel/net/8021q/ $node:tmp/8021q/
-    ssh $node "sudo cp -f tmp/boot/* /boot; sudo cp -f tmp/8021q/8021q.ko /lib/modules/$v/kernel/net/8021q; sudo reboot"
+    ssh -4 $node "sudo cp -f tmp/boot/* /boot; sudo reboot"
 done
diff --git a/cloudlab/bin/update_qdisc b/cloudlab/bin/update_qdisc
new file mode 100755
index 00000000..7f4bae0d
--- /dev/null
+++ b/cloudlab/bin/update_qdisc
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Copyright (c) 2024 Homa Developers
+# SPDX-License-Identifier: BSD-1-Clause
+
+# Update qdisc modules on one or more machines, based on information
+# on the current machine. This must be run in the root Linux build
+# directory.
+#
+# Usage:
+#     update_qdisc num_nodes [first]
+#
+# The "num_nodes" argument indicates how many nodes the command should
+# be run on (starting at node1). The "first" argument is optional; it is
+# an integer identifying the first node on which installation will occur
+# (e.g. "update_qdisc 4 2" means node2 through node5 will be updated).
+# "first" defaults to 1. + +v=`uname -r` +#v=5.17.7+ + +if [ $# -eq 2 ]; then + first=$2 +elif [ $# -eq 1 ]; then + first=1 +else + echo "Usage: update_qdisc num_nodes [first]" + exit 1 +fi +last=`expr $first + $1 - 1` + +for ((i = $first ; i <= $last; i++)); do + node=node$i + echo + echo $node + rsync -rtv net/sched/sch_fq_codel.ko $node:sched/ + ssh $node 'sudo rsync -rtv sched/ /lib/modules/`uname -r`/kernel/net/sched/' +done diff --git a/cloudlab/config_switch b/cloudlab/config_switch index 0610f097..e801f8cb 100755 --- a/cloudlab/config_switch +++ b/cloudlab/config_switch @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2020 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2023 Homa Developers +# SPDX-License-Identifier: BSD-1-Clause # This program outputs the commands needed to configure a CloudLab switch # for Homa, or to reset it. @@ -24,7 +13,7 @@ import sys # Ports to configure -nodes = [104, 102] +nodes = [5, 20, 32, 34, 35, 38] ports = [] prev_switch = -1 for node in nodes: diff --git a/cloudlab/dell_switch.txt b/cloudlab/dell_switch.txt new file mode 100644 index 00000000..5320dbc3 --- /dev/null +++ b/cloudlab/dell_switch.txt @@ -0,0 +1,229 @@ +! Commands for configuring a Dell switch (OS 10.6.0) to enable priority +! queues with strict priority. First, enter configure mode: +enable +configure terminal + +! Create maps from DSCP classes to egress queues, and from queues +! to service policies. +class-map type queuing pq_cm0 +match queue 0 +exit +class-map type queuing pq_cm1 +match queue 1 +exit +class-map type queuing pq_cm2 +match queue 2 +exit +class-map type queuing pq_cm3 +match queue 3 +exit +class-map type queuing pq_cm4 +match queue 4 +exit +class-map type queuing pq_cm5 +match queue 5 +exit +class-map type queuing pq_cm6 +match queue 6 +exit +class-map type queuing pq_cm7 +match queue 7 +exit + +policy-map type queuing pq_pmap +class pq_cm0 +priority +class pq_cm1 +priority +class pq_cm2 +priority +class pq_cm3 +priority +class pq_cm4 +priority +class pq_cm5 +priority +class pq_cm6 +priority +class pq_cm7 +priority +exit + +! Configure all ports to use the trust-map (on input) and the +! policy-map (for output) by default. +system qos +trust-map dscp default +service-policy output type queuing pq_pmap +exit + +! Create a qos-map that maps all traffic classes to queue 0 of egress ports +qos-map traffic-class qos_map_null +queue 0 qos-group 0-7 type ucast +queue 0 qos-group 0-7 type mcast +exit + +! Apply the above qos-map to a few specific ports: this overrides the global +! default so that there are no priorities for these egress ports (queue 0 is +! used for all traffic). 
+interface ethernet 1/1/15 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/16 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/17 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/18 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/47 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/48 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/49 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/50 +qos-map traffic-class qos_map_null +exit + +! Do not apply commands below here: I've provided these to show how to +! undo the effects of the commands above, plus a few other things in +! case you need them in the future. + +! Remove the override for specific ports: +interface ethernet 1/1/36:3 +no qos-map traffic-class +exit +interface ethernet 1/1/28:1 +no qos-map traffic-class +exit +interface ethernet 1/1/57:3 +no qos-map traffic-class +exit +interface ethernet 1/1/39:3 +no qos-map traffic-class +exit +interface ethernet 1/1/31:2 +no qos-map traffic-class +exit +interface ethernet 1/1/62:1 +no qos-map traffic-class +exit +interface ethernet 1/1/36:2 +no qos-map traffic-class +exit +interface ethernet 1/1/59:3 +no qos-map traffic-class +exit +interface ethernet 1/1/58:2 +no qos-map traffic-class +exit +interface ethernet 1/1/31:4 +no qos-map traffic-class +exit +interface ethernet 1/1/7:4 +no qos-map traffic-class +exit + +! Remove the global default +system qos +no trust-map dscp +no service-policy output type queuing +exit + +! Configure specific interfaces to use the trust-map (on input) and policy-map +! (for output). This is an alternative to the global default. +interface ethernet 1/1/36:3 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/28:1 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/57:3 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/39:3 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/31:2 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/62:1 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/36:2 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/59:3 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/58:2 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/31:4 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/7:4 +trust-map dscp default +service-policy output type queuing pq_pmap +exit + +! Remove the port-specific configuration. 
+interface ethernet 1/1/36:3 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/28:1 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/57:3 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/39:3 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/31:2 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/62:1 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/36:2 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/59:3 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/58:2 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/31:4 +no trust-map dscp +no service-policy output type queuing +exit +interface ethernet 1/1/7:4 +no trust-map dscp +no service-policy output type queuing +exit diff --git a/cloudlab/gdbinit b/cloudlab/gdbinit index dc4bcea5..22b8b08a 100644 --- a/cloudlab/gdbinit +++ b/cloudlab/gdbinit @@ -1 +1,2 @@ set style address foreground green +set debuginfod enabled off diff --git a/cloudlab/update b/cloudlab/update index 9ad1c749..8b73ee56 100755 --- a/cloudlab/update +++ b/cloudlab/update @@ -1,18 +1,7 @@ #!/bin/sh -# Copyright (c) 2019-2020, Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE +# Copyright (c) 2019-2020 Homa Developers +# SPDX-License-Identifier: BSD-1-Clause # This script copies modified information from this directory to the # CloudLab machines given by the arguments (defaults are provided if no diff --git a/dissector/CMakeLists.txt b/dissector/CMakeLists.txt new file mode 100644 index 00000000..977482cf --- /dev/null +++ b/dissector/CMakeLists.txt @@ -0,0 +1,69 @@ +# CMakeLists.txt +# +# Copyright 2023 Missing Link Electronics Inc, +# Björn Petersen +# +# This code is dual licensed under one of the following 2 licenses: +# +# ################ +# # GPL2 License # +# ################ +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+#
+#
+# ################
+# # HOMA License #
+# ################
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+cmake_minimum_required(VERSION 3.1)
+
+project(HomaDissector VERSION 0.0.1 DESCRIPTION "Wireshark Homa Plugin" LANGUAGES C)
+option(INSTALL_PLUGIN_LOCAL "Install the homa dissector plugin inside the local folder of wireshark" ON)
+
+find_package(Wireshark CONFIG REQUIRED)
+
+if (NOT Wireshark_PLUGINS_ENABLED)
+    message(FATAL_ERROR "Wireshark was compiled without support for plugins")
+endif ()
+
+set(CMAKE_C_VISIBILITY_PRESET hidden)
+if (CMAKE_COMPILER_IS_GNUCC)
+    set(CMAKE_C_FLAGS "-Wall -Wextra ${CMAKE_C_FLAGS}")
+endif ()
+
+add_definitions(-DVERSION=\"${PROJECT_VERSION}\")
+
+add_library(${PROJECT_NAME} MODULE homa.c)
+set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" DEFINE_SYMBOL "")
+target_link_libraries(${PROJECT_NAME} epan)
+
+if (${INSTALL_PLUGIN_LOCAL})
+    install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION "$ENV{HOME}/.local/lib/wireshark/plugins/${Wireshark_MAJOR_VERSION}.${Wireshark_MINOR_VERSION}/epan" NAMELINK_SKIP)
+else ()
+    install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION "${Wireshark_PLUGIN_INSTALL_DIR}/epan" NAMELINK_SKIP)
+endif ()
diff --git a/dissector/README.md b/dissector/README.md
new file mode 100644
index 00000000..dca7c25e
--- /dev/null
+++ b/dissector/README.md
@@ -0,0 +1,29 @@
+# HOMA Dissector
+
+A dissector for viewing [Homa](https://homa-transport.atlassian.net/wiki/spaces/HOMA/overview) packets. The dissector
+has been tested on Ubuntu 18.04 and Ubuntu 22.04 with Wireshark version 3.6.2.
+
+## Prerequisites
+
+The dissector is a CMake-based Wireshark plugin. Before building, make sure that the required Wireshark dependencies,
+including the Wireshark headers, are installed. On Debian-based systems the following command may be
+used: `apt install wireshark-dev wireshark-common`
+
+## Installation
+
+The plugin can be installed with the following steps:
+
+```shell
+cmake .
+make
+make install
+```
+
+By default, the plugin is installed in the user's local Wireshark plugin folder. To install the plugin globally
+on your system, run the following commands:
+
+```shell
+cmake -DINSTALL_PLUGIN_LOCAL=OFF .
+make +sudo make install +``` diff --git a/dissector/homa.c b/dissector/homa.c new file mode 100644 index 00000000..dc1d902f --- /dev/null +++ b/dissector/homa.c @@ -0,0 +1,408 @@ +/* homa.c + * Wireshark HOMA Plugin + * + * Copyright 2023 Missing Link Electronics Inc, + * Björn Petersen + * + * This code is dual licensed under one of the following 2 licenses: + * + * + * ################ + * # GPL2 License # + * ################ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * + * ################ + * # HOMA License # + * ################ + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <epan/packet.h>
+#include <ws_version.h>
+
+#ifndef VERSION
+#define VERSION "0.0.1"
+#endif
+
+WS_DLL_PUBLIC_DEF const gchar plugin_version[] = VERSION;
+WS_DLL_PUBLIC_DEF const int plugin_want_major = WIRESHARK_VERSION_MAJOR;
+WS_DLL_PUBLIC_DEF const int plugin_want_minor = WIRESHARK_VERSION_MINOR;
+
+#define HOMA_PROTO 0xFD
+
+#define HOMA_HEADER_TYPE_OFFSET 13
+#define HOMA_DATA_PACKET 0x10
+#define HOMA_GRANT_PACKET 0x11
+#define HOMA_RESEND_PACKET 0x12
+#define HOMA_UNKNOWN_PACKET 0x13
+#define HOMA_BUSY_PACKET 0x14
+#define HOMA_CUTOFFS_PACKET 0x15
+#define HOMA_FREEZE_PACKET 0x16
+#define HOMA_NEED_ACK_PACKET 0x17
+#define HOMA_ACK_PACKET 0x18
+
+#define COMMON_HEADER_LENGTH 28
+#define HOMA_ACK_LENGTH 12
+#define DATA_SEGMENT_LENGTH (8 + HOMA_ACK_LENGTH)
+#define DATA_HEADER_LENGTH (12 + DATA_SEGMENT_LENGTH)
+#define RESEND_HEADER_LENGTH 9
+#define GRANT_HEADER_LENGTH 5
+#define CUTOFFS_HEADER_LENGTH 34
+#define ACK_HEADER_LENGTH 62
+
+static int proto_homa = -1;
+
+static int hf_homa_common_sport = -1;
+static int hf_homa_common_dport = -1;
+static int hf_homa_common_doff = -1;
+static int hf_homa_common_type = -1;
+static int hf_homa_common_sender_id = -1;
+static int hf_homa_data_message_length = -1;
+static int hf_homa_data_incoming = -1;
+static int hf_homa_data_cutoff_version = -1;
+static int hf_homa_data_retransmit = -1;
+static int hf_homa_data_offset = -1;
+static int hf_homa_data_segment_length = -1;
+static int hf_homa_ack_client_id = -1;
+static int hf_homa_ack_client_port = -1;
+static int hf_homa_ack_server_port = -1;
+static int hf_homa_grant_offset = -1;
+static int hf_homa_grant_priority = -1;
+static int hf_homa_resend_offset = -1;
+static int hf_homa_resend_length = -1;
+static int hf_homa_resend_priority = -1;
+static int hf_homa_ack_num_acks = -1;
+static int hf_homa_cutoff_unsched_cutoffs = -1;
+static int hf_homa_cutoff_version = -1;
+
+static int ett_homa_common = -1;
+
+static int dissect_homa(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree _U_,
+			void *data _U_)
+{
+	col_set_str(pinfo->cinfo, COL_PROTOCOL, "Homa");
+	/* Clear the info column */
+	col_clear(pinfo->cinfo, COL_INFO);
+	gint header_length = COMMON_HEADER_LENGTH;
+	gint homa_packet_type = tvb_get_guint8(tvb, HOMA_HEADER_TYPE_OFFSET);
+	switch (homa_packet_type) { // Calculate length of header depending on the header type
+	case HOMA_DATA_PACKET:
+		header_length += DATA_HEADER_LENGTH;
+		break;
+	case HOMA_RESEND_PACKET:
+		header_length += RESEND_HEADER_LENGTH;
+		break;
+	case HOMA_GRANT_PACKET:
+		header_length += GRANT_HEADER_LENGTH;
+		break;
+	case HOMA_ACK_PACKET:
+		header_length += ACK_HEADER_LENGTH;
+		break;
+	case HOMA_CUTOFFS_PACKET:
+		header_length += CUTOFFS_HEADER_LENGTH;
+		break;
+	}
+	proto_item *ti = proto_tree_add_item(tree, proto_homa, tvb, 0,
+					     header_length, ENC_NA);
+	proto_tree *homa_tree = proto_item_add_subtree(ti, ett_homa_common);
+	proto_tree *homa_tree_common = NULL;
+	switch (homa_packet_type) { // Select tree for information
+	case HOMA_DATA_PACKET:
+	case HOMA_GRANT_PACKET:
+	case HOMA_CUTOFFS_PACKET:
+	case HOMA_ACK_PACKET:
+	case HOMA_RESEND_PACKET:
+		homa_tree_common = proto_tree_add_subtree(homa_tree, tvb, 0,
+							  COMMON_HEADER_LENGTH,
+							  0, &ti,
+							  "Common Header");
+		break;
+	case HOMA_NEED_ACK_PACKET:
+		col_set_str(pinfo->cinfo, COL_INFO, "Need ACK Packet");
+		homa_tree_common = proto_tree_add_subtree(homa_tree, tvb, 0,
+							  COMMON_HEADER_LENGTH,
+							  0, &ti,
+							  "Need ACK Header");
+		break;
+	case HOMA_FREEZE_PACKET:
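+		/* FREEZE, BUSY, NEED_ACK, and UNKNOWN packets consist of
+		 * nothing beyond the common header, so for these types only
+		 * a labeled subtree for the common fields is created.
+		 */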
		col_set_str(pinfo->cinfo, COL_INFO, "Freeze Packet");
+		homa_tree_common = proto_tree_add_subtree(homa_tree, tvb, 0,
+							  COMMON_HEADER_LENGTH,
+							  0, &ti,
+							  "Freeze Header");
+		break;
+	case HOMA_BUSY_PACKET:
+		col_set_str(pinfo->cinfo, COL_INFO, "Busy Packet");
+		homa_tree_common = proto_tree_add_subtree(homa_tree, tvb, 0,
+							  COMMON_HEADER_LENGTH,
+							  0, &ti,
+							  "Busy Header");
+		break;
+	case HOMA_UNKNOWN_PACKET:
+	default:
+		col_set_str(pinfo->cinfo, COL_INFO, "Unknown Packet");
+		homa_tree_common = proto_tree_add_subtree(
+			homa_tree, tvb, 0, COMMON_HEADER_LENGTH, 0, &ti,
+			"Unknown Packet Header");
+		break;
+	}
+	proto_tree_add_item(homa_tree_common, hf_homa_common_sport, tvb, 0, 2,
+			    ENC_BIG_ENDIAN);
+	proto_tree_add_item(homa_tree_common, hf_homa_common_dport, tvb, 2, 2,
+			    ENC_BIG_ENDIAN);
+	proto_tree_add_item(homa_tree_common, hf_homa_common_doff, tvb, 12, 1,
+			    ENC_BIG_ENDIAN);
+	proto_tree_add_item(homa_tree_common, hf_homa_common_type, tvb,
+			    HOMA_HEADER_TYPE_OFFSET, 1, ENC_BIG_ENDIAN);
+	proto_tree_add_item(homa_tree_common, hf_homa_common_sender_id, tvb, 20,
+			    8, ENC_BIG_ENDIAN);
+
+	switch (homa_packet_type) { // Fill in header fields
+	case HOMA_DATA_PACKET:
+		col_set_str(pinfo->cinfo, COL_INFO, "Data Packet");
+		proto_tree *homa_tree_data = proto_tree_add_subtree(
+			homa_tree, tvb, COMMON_HEADER_LENGTH,
+			header_length - COMMON_HEADER_LENGTH, 0, &ti,
+			"Data Header");
+		proto_tree_add_item(homa_tree_data, hf_homa_data_message_length,
+				    tvb, COMMON_HEADER_LENGTH, 4,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_data, hf_homa_data_incoming, tvb,
+				    COMMON_HEADER_LENGTH + 4, 4,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_data, hf_homa_data_cutoff_version,
+				    tvb, COMMON_HEADER_LENGTH + 8, 2,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_data, hf_homa_data_retransmit,
+				    tvb, COMMON_HEADER_LENGTH + 10, 1,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_data, hf_homa_data_offset, tvb,
+				    COMMON_HEADER_LENGTH + 12, 4,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_data, hf_homa_data_segment_length,
+				    tvb, COMMON_HEADER_LENGTH + 16, 4,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_data, hf_homa_ack_client_id, tvb,
+				    COMMON_HEADER_LENGTH + 20, 8,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_data, hf_homa_ack_client_port,
+				    tvb, COMMON_HEADER_LENGTH + 28, 2,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_data, hf_homa_ack_server_port,
+				    tvb, COMMON_HEADER_LENGTH + 30, 2,
+				    ENC_BIG_ENDIAN);
+		break;
+	case HOMA_RESEND_PACKET:
+		col_set_str(pinfo->cinfo, COL_INFO, "Resend Packet");
+		proto_tree *homa_tree_resend = proto_tree_add_subtree(
+			homa_tree, tvb, COMMON_HEADER_LENGTH,
+			header_length - COMMON_HEADER_LENGTH, 0, &ti,
+			"Resend Header");
+		proto_tree_add_item(homa_tree_resend, hf_homa_resend_offset,
+				    tvb, COMMON_HEADER_LENGTH, 4,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_resend, hf_homa_resend_length,
+				    tvb, COMMON_HEADER_LENGTH + 4, 4,
+				    ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_resend, hf_homa_resend_priority,
+				    tvb, COMMON_HEADER_LENGTH + 8, 1,
+				    ENC_BIG_ENDIAN);
+		break;
+	case HOMA_GRANT_PACKET:
+		col_set_str(pinfo->cinfo, COL_INFO, "Grant Packet");
+		proto_tree *homa_tree_grant = proto_tree_add_subtree(
+			homa_tree, tvb, COMMON_HEADER_LENGTH,
+			header_length - COMMON_HEADER_LENGTH, 0, &ti,
+			"Grant Header");
+		proto_tree_add_item(homa_tree_grant, hf_homa_grant_offset, tvb,
+				    COMMON_HEADER_LENGTH, 4, ENC_BIG_ENDIAN);
+		proto_tree_add_item(homa_tree_grant, hf_homa_grant_priority,
+				    tvb, COMMON_HEADER_LENGTH + 4, 1,
ENC_BIG_ENDIAN); + break; + case HOMA_ACK_PACKET: + col_set_str(pinfo->cinfo, COL_INFO, "ACK Packet"); + proto_tree *homa_tree_ack = proto_tree_add_subtree( + homa_tree, tvb, COMMON_HEADER_LENGTH, + header_length - COMMON_HEADER_LENGTH, 0, &ti, + "ACK Header"); + proto_tree_add_item(homa_tree_ack, hf_homa_ack_num_acks, tvb, + COMMON_HEADER_LENGTH, 2, ENC_BIG_ENDIAN); + break; + + case HOMA_CUTOFFS_PACKET: + col_set_str(pinfo->cinfo, COL_INFO, "Cutoff Packet"); + proto_tree *homa_tree_cutoff = proto_tree_add_subtree( + homa_tree, tvb, COMMON_HEADER_LENGTH, + header_length - COMMON_HEADER_LENGTH, 0, &ti, + "Cutoff Header"); + proto_tree_add_item(homa_tree_cutoff, + hf_homa_cutoff_unsched_cutoffs, tvb, + COMMON_HEADER_LENGTH, 32, ENC_BIG_ENDIAN); + proto_tree_add_item(homa_tree_cutoff, hf_homa_cutoff_version, + tvb, COMMON_HEADER_LENGTH + 32, 2, + ENC_BIG_ENDIAN); + break; + } + call_data_dissector(tvb_new_subset_remaining(tvb, header_length), pinfo, + tree); + tvb_reported_length_remaining(tvb, header_length); + tvb_set_reported_length(tvb, header_length); + return tvb_reported_length(tvb); +} + +static void proto_register_homa(void) +{ + static hf_register_info hf_common[] = { + { &hf_homa_common_sport, + { "Homa source port", "homa.sport", FT_UINT16, BASE_DEC, NULL, + 0x0, NULL, HFILL } }, + { &hf_homa_common_dport, + { "Homa dest port", "homa.dport", FT_UINT16, BASE_DEC, NULL, + 0x0, NULL, HFILL } }, + { &hf_homa_common_type, + { "Homa packet type", "homa.type", FT_UINT8, BASE_HEX, NULL, + 0x0, NULL, HFILL } }, + { &hf_homa_common_doff, + { "Homa data offset", "homa.doff", FT_UINT8, BASE_DEC, NULL, + 0xF0, NULL, HFILL } }, + { &hf_homa_common_sender_id, + { "Homa sender ID", "homa.id", FT_UINT64, BASE_DEC, NULL, 0x0, + NULL, HFILL } } + }; + static hf_register_info hf_data[] = { + { &hf_homa_data_message_length, + { "Homa message length", "homa.length", FT_UINT32, BASE_DEC, + NULL, 0x0, NULL, HFILL } }, + { &hf_homa_data_incoming, + { "Homa incoming", "homa.incoming", FT_UINT32, BASE_DEC, NULL, + 0x0, NULL, HFILL } }, + { &hf_homa_data_cutoff_version, + { "Homa cutoff version", "homa.cutoff_version", FT_UINT16, + BASE_DEC, NULL, 0x0, NULL, HFILL } }, + { &hf_homa_data_retransmit, + { "Homa retransmit", "homa.retransmit", FT_UINT8, BASE_DEC, + NULL, 0x0, NULL, HFILL } }, + { &hf_homa_data_offset, + { "Homa segment offset", "homa.offset", FT_UINT32, BASE_DEC, + NULL, 0x0, NULL, HFILL } }, + { &hf_homa_data_segment_length, + { "Homa segment length", "homa.segment_length", FT_UINT32, + BASE_DEC, NULL, 0x0, NULL, HFILL } } + }; + static hf_register_info hf_homa_ack[] = { + { &hf_homa_ack_client_id, + { "Homa client id", "homa.client_id", FT_UINT64, BASE_DEC, + NULL, 0x0, NULL, HFILL } }, + { &hf_homa_ack_client_port, + { "Homa client port", "homa.client_port", FT_UINT16, BASE_DEC, + NULL, 0x0, NULL, HFILL } }, + { &hf_homa_ack_server_port, + { "Homa server port", "homa.server_port", FT_UINT16, BASE_DEC, + NULL, 0x0, NULL, HFILL } } + }; + static hf_register_info hf_grant[] = { + { &hf_homa_grant_offset, + { "Homa grant offset", "homa.grant_offset", FT_UINT32, + BASE_DEC, NULL, 0x0, NULL, HFILL } }, + { &hf_homa_grant_priority, + { "Homa grant priority", "homa.grant_priority", FT_UINT8, + BASE_DEC, NULL, 0x0, NULL, HFILL } } + }; + static hf_register_info hf_resend[] = { + { &hf_homa_resend_offset, + { "Homa resend offset", "homa.resend_offset", FT_UINT32, + BASE_DEC, NULL, 0x0, NULL, HFILL } }, + { &hf_homa_resend_length, + { "Homa resend length", "homa.resend_length", FT_UINT32, + BASE_DEC, 
NULL, 0x0, NULL, HFILL } }, + { &hf_homa_resend_priority, + { "Homa resend priority", "homa.resend_priority", FT_UINT8, + BASE_DEC, NULL, 0x0, NULL, HFILL } } + }; + + static hf_register_info hf_header_ack[] = { + { &hf_homa_ack_num_acks, + { "Homa number of acks", "homa.num_acks", FT_UINT16, BASE_DEC, + NULL, 0x0, NULL, HFILL } }, + }; + + static hf_register_info hf_cutoffs[] = { + { &hf_homa_cutoff_unsched_cutoffs, + { "Homa unscheduled cutoffs", "homa.unsched_cutoffs", + FT_UINT32, BASE_DEC, NULL, 0x0, NULL, HFILL } }, + { &hf_homa_cutoff_version, + { "Homa cutoff version", "homa.cutoff.cutoff_version", + FT_UINT16, BASE_DEC, NULL, 0x0, NULL, HFILL } }, + }; + + /* Setup protocol subtree array */ + static int *ett[] = { &ett_homa_common }; + + proto_homa = proto_register_protocol("Homa Protocol", /* name */ + "Homa", /* short_name */ + "homa" /* filter_name */ + ); + + proto_register_field_array(proto_homa, hf_common, + array_length(hf_common)); + proto_register_field_array(proto_homa, hf_data, array_length(hf_data)); + proto_register_field_array(proto_homa, hf_homa_ack, + array_length(hf_homa_ack)); + proto_register_field_array(proto_homa, hf_grant, + array_length(hf_grant)); + proto_register_field_array(proto_homa, hf_resend, + array_length(hf_resend)); + proto_register_field_array(proto_homa, hf_header_ack, + array_length(hf_header_ack)); + proto_register_field_array(proto_homa, hf_cutoffs, + array_length(hf_cutoffs)); + proto_register_subtree_array(ett, array_length(ett)); +} + +static void proto_reg_handoff_homa(void) +{ + static dissector_handle_t homa_handle; + + homa_handle = create_dissector_handle(dissect_homa, proto_homa); + dissector_add_uint("ip.proto", HOMA_PROTO, homa_handle); +} + +WS_DLL_PUBLIC void plugin_register(void) +{ + static proto_plugin protoPlugin; + + protoPlugin.register_protoinfo = proto_register_homa; + protoPlugin.register_handoff = proto_reg_handoff_homa; + proto_register_plugin(&protoPlugin); +} diff --git a/homa.h b/homa.h index 49238ad7..5e5505fe 100644 --- a/homa.h +++ b/homa.h @@ -1,24 +1,11 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ WITH Linux-syscall-note */ /* This file defines the kernel call interface for the Homa * transport protocol. */ -#ifndef _HOMA_H -#define _HOMA_H +#ifndef _UAPI_LINUX_HOMA_H +#define _UAPI_LINUX_HOMA_H #include #ifndef __KERNEL__ @@ -26,15 +13,8 @@ #include #endif -#ifdef __cplusplus -extern "C" -{ -#endif - -/* Homa's protocol number within the IP protocol space (this is not an - * officially allocated slot). - */ -#define IPPROTO_HOMA 0xFD +/* IANA-assigned Internet Protocol number for Homa. 
*/ +#define IPPROTO_HOMA 146 /** * define HOMA_MAX_MESSAGE_LENGTH - Maximum bytes of payload in a Homa @@ -46,34 +26,24 @@ extern "C" * define HOMA_BPAGE_SIZE - Number of bytes in pages used for receive * buffers. Must be power of two. */ -#define HOMA_BPAGE_SHIFT 16 #define HOMA_BPAGE_SIZE (1 << HOMA_BPAGE_SHIFT) +#define HOMA_BPAGE_SHIFT 16 /** - * define HOMA_MAX_BPAGES: The largest number of bpages that will be required + * define HOMA_MAX_BPAGES - The largest number of bpages that will be required * to store an incoming message. */ -#define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) \ - >> HOMA_BPAGE_SHIFT) +#define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) >> \ + HOMA_BPAGE_SHIFT) /** - * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into + * define HOMA_MIN_DEFAULT_PORT - The 16 bit port space is divided into * two nonoverlapping regions. Ports 1-32767 are reserved exclusively * for well-defined server ports. The remaining ports are used for client * ports; these are allocated automatically by Homa. Port 0 is reserved. */ #define HOMA_MIN_DEFAULT_PORT 0x8000 -/** - * Holds either an IPv4 or IPv6 address (smaller and easier to use than - * sockaddr_storage). - */ -typedef union sockaddr_in_union { - struct sockaddr sa; - struct sockaddr_in in4; - struct sockaddr_in6 in6; -} sockaddr_in_union; - /** * struct homa_sendmsg_args - Provides information needed by Homa's * sendmsg; passed to sendmsg using the msg_control field. @@ -85,154 +55,281 @@ struct homa_sendmsg_args { * id. If the message is a request, then the value is modified to * hold the id of the new RPC. */ - uint64_t id; + __u64 id; /** * @completion_cookie: (in) Used only for request messages; will be * returned by recvmsg when the RPC completes. Typically used to * locate app-specific info about the RPC. */ - uint64_t completion_cookie; + __u64 completion_cookie; + + /** + * @flags: (in) OR-ed combination of bits that control the operation. + * See below for values. + */ + __u32 flags; + + /** @reserved: Not currently used, must be 0. */ + __u32 reserved; }; -#if !defined(__cplusplus) -_Static_assert(sizeof(struct homa_sendmsg_args) >= 16, - "homa_sendmsg_args shrunk"); -_Static_assert(sizeof(struct homa_sendmsg_args) <= 16, - "homa_sendmsg_args grew"); -#endif + +/* Flag bits for homa_sendmsg_args.flags (see man page for documentation): + */ +#define HOMA_SENDMSG_PRIVATE 0x01 +#define HOMA_SENDMSG_VALID_FLAGS 0x01 /** * struct homa_recvmsg_args - Provides information needed by Homa's * recvmsg; passed to recvmsg using the msg_control field. */ struct homa_recvmsg_args { - /** - * @id: (in/out) Initially specifies the id of the desired RPC, or 0 - * if any RPC is OK; returns the actual id received. + * @id: (in/out) Initial value is 0 to wait for any shared RPC; + * nonzero means wait for that specific (private) RPC. Returns + * the id of the RPC received. */ - uint64_t id; + __u64 id; /** * @completion_cookie: (out) If the incoming message is a response, * this will return the completion cookie specified when the * request was sent. For requests this will always be zero. */ - uint64_t completion_cookie; - - /** - * @flags: (in) OR-ed combination of bits that control the operation. - * See below for values. - */ - int flags; + __u64 completion_cookie; /** * @num_bpages: (in/out) Number of valid entries in @bpage_offsets. * Passes in bpages from previous messages that can now be * recycled; returns bpages from the new message. 
*/ - uint32_t num_bpages; + __u32 num_bpages; - uint32_t _pad[2]; + /** @reserved: Not currently used, must be 0. */ + __u32 reserved; /** * @bpage_offsets: (in/out) Each entry is an offset into the buffer - * region for the socket pool. When returned from recvmsg, the - * offsets indicate where fragments of the new message are stored. All - * entries but the last refer to full buffer pages (HOMA_BPAGE_SIZE bytes) - * and are bpage-aligned. The last entry may refer to a bpage fragment and - * is not necessarily aligned. The application now owns these bpages and - * must eventually return them to Homa, using bpage_offsets in a future - * recvmsg invocation. - */ - uint32_t bpage_offsets[HOMA_MAX_BPAGES]; + * region for the socket pool. When returned from recvmsg, the + * offsets indicate where fragments of the new message are stored. All + * entries but the last refer to full buffer pages (HOMA_BPAGE_SIZE + * bytes) and are bpage-aligned. The last entry may refer to a bpage + * fragment and is not necessarily aligned. The application now owns + * these bpages and must eventually return them to Homa, using + * bpage_offsets in a future recvmsg invocation. + */ + __u32 bpage_offsets[HOMA_MAX_BPAGES]; }; -#if !defined(__cplusplus) -_Static_assert(sizeof(struct homa_recvmsg_args) >= 96, - "homa_recvmsg_args shrunk"); -_Static_assert(sizeof(struct homa_recvmsg_args) <= 96, - "homa_recvmsg_args grew"); -#endif - -/* Flag bits for homa_recvmsg_args.flags (see man page for documentation): - */ -#define HOMA_RECVMSG_REQUEST 0x01 -#define HOMA_RECVMSG_RESPONSE 0x02 -#define HOMA_RECVMSG_NONBLOCKING 0x04 -#define HOMA_RECVMSG_VALID_FLAGS 0x07 +#ifndef __STRIP__ /* See strip.py */ /** * struct homa_abort_args - Structure that passes arguments and results * between user space and the HOMAIOCABORT ioctl. */ struct homa_abort_args { /** @id: Id of RPC to abort, or zero to abort all RPCs on socket. */ - uint64_t id; + __u64 id; /** * @error: Zero means destroy and free RPCs; nonzero means complete * them with this error (recvmsg will return the RPCs). */ - int error; + __u32 error; + + /** @_pad1: Reserved. */ + __u32 _pad1; - int _pad1; - uint64_t _pad2[2]; + /** @_pad2: Reserved. */ + __u64 _pad2[2]; }; -#if !defined(__cplusplus) -_Static_assert(sizeof(struct homa_abort_args) >= 32, "homa_abort_args shrunk"); -_Static_assert(sizeof(struct homa_abort_args) <= 32, "homa_abort_args grew"); -#endif +#endif /* See strip.py */ + +/** define SO_HOMA_RCVBUF: setsockopt option for specifying buffer region. */ +#define SO_HOMA_RCVBUF 10 -/** define SO_HOMA_SET_BUF: setsockopt option for specifying buffer region. */ -#define SO_HOMA_SET_BUF 10 +/** + * define SO_HOMA_SERVER: setsockopt option for specifying whether a + * socket will act as server. + */ +#define SO_HOMA_SERVER 11 -/** struct homa_set_buf - setsockopt argument for SO_HOMA_SET_BUF. */ -struct homa_set_buf_args { - /** @start: First byte of buffer region. */ - void *start; +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */ +struct homa_rcvbuf_args { + /** @start: Address of first byte of buffer region in user space. */ + __u64 start; /** @length: Total number of bytes available at @start. */ size_t length; }; -/** - * Meanings of the bits in Homa's flag word, which can be set using +/* Meanings of the bits in Homa's flag word, which can be set using * "sysctl /net/homa/flags". */ /** - * Disable the output throttling mechanism: always send all packets - * immediately. 
+ * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism + * (always send all packets immediately). */ #define HOMA_FLAG_DONT_THROTTLE 2 /** - * I/O control calls on Homa sockets. These are mapped into the - * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef. + * struct homa_rpc_info - Used by HOMAIOCINFO to return information about + * a specific RPC. */ +struct homa_rpc_info { + /** + * @id: Identifier for the RPC, unique among all RPCs sent by the + * client node. If the low-order bit is 1, this node is the server + * for the RPC; 0 means we are the client. + */ + __u64 id; -#define HOMAIOCREPLY _IOWR(0x89, 0xe2, struct homa_reply_args) -#define HOMAIOCABORT _IOWR(0x89, 0xe3, struct homa_abort_args) -#define HOMAIOCFREEZE _IO(0x89, 0xef) - -extern int homa_abortp(int fd, struct homa_abort_args *args); - -extern int homa_send(int sockfd, const void *message_buf, - size_t length, const sockaddr_in_union *dest_addr, - uint64_t *id, uint64_t completion_cookie); -extern int homa_sendv(int sockfd, const struct iovec *iov, - int iovcnt, const sockaddr_in_union *dest_addr, - uint64_t *id, uint64_t completion_cookie); -extern ssize_t homa_reply(int sockfd, const void *message_buf, - size_t length, const sockaddr_in_union *dest_addr, - uint64_t id); -extern ssize_t homa_replyv(int sockfd, const struct iovec *iov, - int iovcnt, const sockaddr_in_union *dest_addr, - uint64_t id); -extern int homa_abort(int sockfd, uint64_t id, int error); - -#ifdef __cplusplus -} -#endif + /** @peer: Address of the peer socket for this RPC. */ + union { + struct sockaddr_storage storage; + struct sockaddr_in in4; + struct sockaddr_in6 in6; + } peer; + + /** + * @completion_cookie: For client-side RPCs this gives the completion + * cookie specified when the RPC was initiated. For server-side RPCs + * this is zero. + */ + __u64 completion_cookie; + + /** + * @tx_length: Length of the outgoing message in bytes, or -1 if + * the sendmsg hasn't yet been called. + */ + __s32 tx_length; + + /** + * @tx_sent: Number of bytes of the outgoing message that have been + * transmitted at least once. + */ + __u32 tx_sent; + + /** + * @tx_granted: Number of bytes of the outgoing message that the + * receiver has authorized us to transmit (includes unscheduled + * bytes). + */ + __u32 tx_granted; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @tx_prio: Current priority level that the receiver has specified + * for transmitting packets. + */ + __u32 tx_prio; +#else /* See strip.py */ + /** @reserved: Reserved for future use. */ + __u32 reserved; +#endif /* See strip.py */ + + /** + * @rx_length: Length of the incoming message, in bytes. -1 means + * the length is not yet known (this is a client-side RPC and + * no packets have been received). + */ + __s32 rx_length; + + /** + * @rx_remaining: Number of bytes in the incoming message that have + * not yet been received. + */ + __u32 rx_remaining; + + /** + * @rx_gaps: The number of gaps in the incoming message. A gap is + * a range of bytes that have not been received yet, but bytes after + * the gap have been received. + */ + __u32 rx_gaps; + + /** + * @rx_gap_bytes: The total number of bytes in gaps in the incoming + * message. + */ + __u32 rx_gap_bytes; + + /** + * @rx_granted: The number of bytes in the message that the sender + * is authorized to transmit (includes unscheduled bytes). 
+ */ + __u32 rx_granted; + + /** + * @flags: Various single-bit values associated with the RPC: + * HOMA_RPC_BUF_STALL: The incoming message is currently stalled + * because there is insufficient receiver buffer + * space. + * HOMA_RPC_PRIVATE: The RPC has been created as "private"; set + * only on the client side. + * HOMA_RPC_RX_READY: The incoming message is complete and has + * been queued waiting for a thread to call + * recvmsg. + * HOMA_RPC_RX_COPY: There are packets that have been received, + * whose data has not yet been copied from + * packet buffers to user space. + */ + __u16 flags; +#define HOMA_RPC_BUF_STALL 1 +#define HOMA_RPC_PRIVATE 2 +#define HOMA_RPC_RX_READY 4 +#define HOMA_RPC_RX_COPY 8 +}; + +/** + * struct homa_info - In/out argument passed to HOMAIOCINFO. Fields labeled + * as "in" must be set by the application; other fields are returned to the + * application from the kernel. + */ +struct homa_info { + /** + * @rpc_info: (in) Address of memory region in which to store + * information about individual RPCs. + */ + struct homa_rpc_info *rpc_info; + + /** + * @rpc_info_length: (in) Number of bytes of storage available at + * rpc_info. + */ + size_t rpc_info_length; + + /** + * @bpool_avail_bytes: Number of bytes in the buffer pool for incoming + * messages that is currently available for new messages. + */ + __u64 bpool_avail_bytes; + + /** @port: Port number handled by this socket. */ + __u32 port; + + /** + * @num_rpcs: Total number of active RPCs (both server and client) for + * this socket. The number stored at @rpc_info will be less than this + * if @rpc_info_length is too small. + */ + __u32 num_rpcs; + + /** + * @error_msg: Provides additional information about the last error + * returned by a Homa-related kernel call such as sendmsg, recvmsg, + * or ioctl. Not updated for some obvious return values such as EINTR + * or EWOULDBLOCK. + */ +#define HOMA_ERROR_MSG_SIZE 100 + char error_msg[HOMA_ERROR_MSG_SIZE]; +}; + +/* I/O control calls on Homa sockets.*/ +#define HOMAIOCINFO _IOWR('h', 1, struct homa_info) +#ifndef __STRIP__ /* See strip.py */ +#define HOMAIOCABORT _IOWR('h', 2, struct homa_abort_args) +#define HOMAIOCFREEZE _IO('h', 3) +#endif /* See strip.py */ -#endif /* _HOMA_H */ +#endif /* _UAPI_LINUX_HOMA_H */ diff --git a/homa_api.c b/homa_api.c deleted file mode 100644 index b6ef234d..00000000 --- a/homa_api.c +++ /dev/null @@ -1,219 +0,0 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* This file contains functions that implement the Homa API visible to - * applications. It is intended to be part of the user-level run-time library. 
- */ - -#include -#include -#include -#include -#ifndef NDEBUG -#include -#endif -#include -#include - -#include "homa.h" - -/** - * homa_reply() - Send a response message for an RPC previously received - * with a call to recvmsg. - * @sockfd: File descriptor for the socket on which to send the message. - * @response: First byte of buffer containing the response message. - * @resplen: Number of bytes at @response. - * @dest_addr: Address of the RPC's client (returned by recvmsg when - * the message was received). - * @id: Unique identifier for the request, as returned by recvmsg - * when the request was received. - * - * @dest_addr and @id must correspond to a previously-received request - * for which no reply has yet been sent; if there is no such active request, - * then this function does nothing. - * - * Return: 0 means the response has been accepted for delivery. If an - * error occurred, -1 is returned and errno is set appropriately. - */ -ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, - const sockaddr_in_union *dest_addr, uint64_t id) -{ - struct homa_sendmsg_args args; - struct iovec vec; - struct msghdr hdr; - int result; - - args.id = id; - args.completion_cookie = 0; - - vec.iov_base = (void *) message_buf; - vec.iov_len = length; - - hdr.msg_name = (void *) dest_addr; - hdr.msg_namelen = sizeof(*dest_addr); - hdr.msg_iov = &vec; - hdr.msg_iovlen = 1; - hdr.msg_control = &args; - hdr.msg_controllen = 0; - result = sendmsg(sockfd, &hdr, 0); - return result; -} - -/** - * homa_replyv() - Similar to homa_reply, except the response - * message can be divided among several chunks of memory. - * @sockfd: File descriptor for the socket on which to send the message. - * @iov: Pointer to array that describes the chunks of the response - * message. - * @iovcnt: Number of elements in @iov. - * @dest_addr: Address of the RPC's client (returned by recvmsg when - * the message was received). - * @id: Unique identifier for the request, as returned by recvmsg - * when the request was received. - * - * @dest_addr and @id must correspond to a previously-received request - * for which no reply has yet been sent; if there is no such active request, - * then this function does nothing. - * - * Return: 0 means the response has been accepted for delivery. If an - * error occurred, -1 is returned and errno is set appropriately. - */ -ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, - const sockaddr_in_union *dest_addr, uint64_t id) -{ - struct homa_sendmsg_args args; - struct msghdr hdr; - int result; - - args.id = id; - args.completion_cookie = 0; - - hdr.msg_name = (void *) dest_addr; - hdr.msg_namelen = sizeof(*dest_addr); - hdr.msg_iov = (struct iovec *) iov; - hdr.msg_iovlen = iovcnt; - hdr.msg_control = &args; - hdr.msg_controllen = 0; - result = sendmsg(sockfd, &hdr, 0); - return result; -} - -/** - * homa_send() - Send a request message to initiate an RPC. - * @sockfd: File descriptor for the socket on which to send the - * message. - * @message_buf: First byte of buffer containing the request message. - * @length: Number of bytes at @message_buf. - * @dest_addr: Address of server to which the request should be sent. - * @id: A unique identifier for the request will be returned - * here; this can be used later to find the response for - * this request. - * @completion_cookie: Value to be returned by recvmsg when RPC completes. - * - * Return: 0 means the request has been accepted for delivery. 
If an - * error occurred, -1 is returned and errno is set appropriately. - */ -int homa_send(int sockfd, const void *message_buf, size_t length, - const sockaddr_in_union *dest_addr, uint64_t *id, - uint64_t completion_cookie) -{ - struct homa_sendmsg_args args; - struct iovec vec; - struct msghdr hdr; - int result; - - args.id = 0; - args.completion_cookie = completion_cookie; - - vec.iov_base = (void *) message_buf; - vec.iov_len = length; - - hdr.msg_name = (void *) dest_addr; - /* For some unknown reason, this change improves short-message P99 - * latency by 20% in W3 under IPv4 (as of December 2022). - */ -// hdr.msg_namelen = sizeof(*dest_addr); - hdr.msg_namelen = dest_addr->in4.sin_family == AF_INET ? - sizeof(dest_addr->in4) : sizeof(dest_addr->in6); - hdr.msg_iov = &vec; - hdr.msg_iovlen = 1; - hdr.msg_control = &args; - hdr.msg_controllen = 0; - result = sendmsg(sockfd, &hdr, 0); - if ((result >= 0) && (id != NULL)) - *id = args.id; - return result; -} - -/** - * homa_sendv() - Same as homa_send, except that the request message can - * be divided among multiple disjoint chunks of memory. - * @sockfd: File descriptor for the socket on which to send the - * message. - * @iov: Pointer to array that describes the chunks of the request - * message. - * @iovcnt: Number of elements in @iov. - * @dest_addr: Address of server to which the request should be sent. - * @id: A unique identifier for the request will be returned - * here; this can be used later to find the response for - * this request. - * @completion_cookie: Value to be returned by recvmsg when RPC completes. - * - * Return: 0 means the request has been accepted for delivery. If an - * error occurred, -1 is returned and errno is set appropriately. - */ -int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, - const sockaddr_in_union *dest_addr, uint64_t *id, - uint64_t completion_cookie) -{ - struct homa_sendmsg_args args; - struct msghdr hdr; - int result; - - args.id = 0; - args.completion_cookie = completion_cookie; - - hdr.msg_name = (void *) dest_addr; - hdr.msg_namelen = sizeof(*dest_addr); - hdr.msg_iov = (struct iovec *) iov; - hdr.msg_iovlen = iovcnt; - hdr.msg_control = &args; - hdr.msg_controllen = 0; - result = sendmsg(sockfd, &hdr, 0); - if ((result >= 0) && (id != NULL)) - *id = args.id; - return result; -} - -/** - * homa_abort() - Terminate the execution of an RPC. - * @sockfd: File descriptor for the socket associated with the RPC. - * @id: Unique identifier for a client RPC to abort (return value - * from previous call to homa_send). 0 means abort all client - * RPCs on this socket. - * @error: 0 means that the aborted RPCs should be destroyed - * immediately (they will never be returned by recvmsg). - * Nonzero means that the RPCs should be moved to the - * completed state; recvmsg will return an error for these - * RPCs, with @error as the errno value. - * - * Return: If an error occurred, -1 is returned and errno is set - * appropriately. Otherwise zero is returned. - */ -int homa_abort(int sockfd, uint64_t id, int error) -{ - struct homa_abort_args args = {id, error}; - return ioctl(sockfd, HOMAIOCABORT, &args); -} diff --git a/homa_devel.c b/homa_devel.c new file mode 100644 index 00000000..272aede9 --- /dev/null +++ b/homa_devel.c @@ -0,0 +1,1271 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +/* This file contains functions that are useful to have in Homa during + * development, but aren't needed in production versions. 
+ */ + +#include "homa_impl.h" +#include "homa_devel.h" +#include "homa_grant.h" +#include "homa_peer.h" +#include "homa_rpc.h" +#ifndef __STRIP__ /* See strip.py */ +#include "homa_qdisc.h" +#include "homa_skb.h" +#else /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ +#include "homa_wire.h" + +#include + +#ifndef __STRIP__ /* See strip.py */ +/* homa_drop_packet will accept this many more packets before it drops some. */ +static int accept_count; + +/* If accept_count <= 0, homa_drop_packet will drop this many packets + * before it starts accepting again. + */ +static int drop_count; + +/* Used for random-number generation. */ +static u32 seed; +#endif /* See strip.py */ + +/* Used to record a history of rx state. */ +#define MAX_RX_SNAPSHOTS 1000 +static struct homa_rpc_snapshot rpc_snapshots[MAX_RX_SNAPSHOTS]; +static int next_snapshot; + +/* homa_clock() time when most recent rx snapshot was taken. */ +u64 snapshot_time; + +/* Interval between rx snapshots in ms. */ +#define RX_SNAPSHOT_INTERVAL 20 + +/* Interval between rx snapshots, in homa_clock() units. */ +u64 snapshot_interval; + +/** + * homa_print_ipv4_addr() - Convert an IPV4 address to the standard string + * representation. + * @addr: Address to convert, in network byte order. + * + * Return: The converted value. Values are stored in static memory, so + * the caller need not free. This also means that storage is + * eventually reused (there are enough buffers to accommodate + * multiple "active" values). + * + * Note: Homa uses this function, rather than the %pI4 format specifier + * for snprintf et al., because the kernel's version of snprintf isn't + * available in Homa's unit test environment. + */ +char *homa_print_ipv4_addr(__be32 addr) +{ +#define NUM_BUFS_IPV4 4 +#define BUF_SIZE_IPV4 30 + static char buffers[NUM_BUFS_IPV4][BUF_SIZE_IPV4]; + u32 a2 = ntohl(addr); + static int next_buf; + char *buffer; + + buffer = buffers[next_buf]; + next_buf++; + if (next_buf >= NUM_BUFS_IPV4) + next_buf = 0; + snprintf(buffer, BUF_SIZE_IPV4, "%u.%u.%u.%u", (a2 >> 24) & 0xff, + (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); + return buffer; +} + +/** + * homa_print_ipv6_addr() - Convert an IPv6 address to a human-readable string + * representation. IPv4-mapped addresses are printed in IPv4 syntax. + * @addr: Address to convert, in network byte order. + * + * Return: The converted value. Values are stored in static memory, so + * the caller need not free. This also means that storage is + * eventually reused (there are enough buffers to accommodate + * multiple "active" values). 
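+ *
+ * Because results rotate through several static buffers, it is safe
+ * to format more than one address in a single statement, e.g. (saddr
+ * and daddr are illustrative locals):
+ *
+ *	pr_notice("src %s, dst %s\n", homa_print_ipv6_addr(&saddr),
+ *		  homa_print_ipv6_addr(&daddr));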
+ */ +char *homa_print_ipv6_addr(const struct in6_addr *addr) +{ +#define NUM_BUFS BIT(2) +#define BUF_SIZE 64 + static char buffers[NUM_BUFS][BUF_SIZE]; + static int next_buf; + char *buffer; + + buffer = buffers[next_buf]; + next_buf++; + if (next_buf >= NUM_BUFS) + next_buf = 0; +#ifdef __UNIT_TEST__ + struct in6_addr zero = {}; + + if (ipv6_addr_equal(addr, &zero)) { + snprintf(buffer, BUF_SIZE, "0.0.0.0"); + } else if ((addr->s6_addr32[0] == 0) && + (addr->s6_addr32[1] == 0) && + (addr->s6_addr32[2] == htonl(0x0000ffff))) { + u32 a2 = ntohl(addr->s6_addr32[3]); + + snprintf(buffer, BUF_SIZE, "%u.%u.%u.%u", (a2 >> 24) & 0xff, + (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); + } else { + const char *inet_ntop(int af, const void *src, char *dst, + size_t size); + inet_ntop(AF_INET6, addr, buffer + 1, BUF_SIZE); + buffer[0] = '['; + strcat(buffer, "]"); + } +#else + snprintf(buffer, BUF_SIZE, "%pI6", addr); +#endif + return buffer; +} + +/** + * homa_print_packet() - Print a human-readable string describing the + * information in a Homa packet. + * @skb: Packet whose information should be printed. + * @buffer: Buffer in which to generate the string. + * @buf_len: Number of bytes available at @buffer. + * + * Return: @buffer + */ +char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) +{ + struct homa_common_hdr *common; + char header[HOMA_MAX_HEADER]; + struct in6_addr saddr; + int used = 0; + + if (!skb) { + snprintf(buffer, buf_len, "skb is NULL!"); + buffer[buf_len - 1] = 0; + return buffer; + } + + homa_skb_get(skb, &header, 0, sizeof(header)); + common = (struct homa_common_hdr *)header; + saddr = skb_canonical_ipv6_saddr(skb); + used = homa_snprintf(buffer, buf_len, used, + "%s from %s:%u, dport %d, id %llu", + homa_symbol_for_type(common->type), + homa_print_ipv6_addr(&saddr), + ntohs(common->sport), ntohs(common->dport), + be64_to_cpu(common->sender_id)); + switch (common->type) { + case DATA: { + struct homa_skb_info *homa_info = homa_get_skb_info(skb); + struct homa_data_hdr *h = (struct homa_data_hdr *)header; + int data_left, i, seg_length, pos, offset; + + if (skb_shinfo(skb)->gso_segs == 0) { + seg_length = homa_data_len(skb); + data_left = 0; + } else { + seg_length = homa_info->seg_length; + if (seg_length > homa_info->data_bytes) + seg_length = homa_info->data_bytes; + data_left = homa_info->data_bytes - seg_length; + } + offset = homa_get_offset(h); +#ifndef __STRIP__ /* See strip.py */ + used = homa_snprintf(buffer, buf_len, used, + ", message_length %d, offset %d, data_length %d, incoming %d", + ntohl(h->message_length), offset, + seg_length, ntohl(h->incoming)); + if (ntohs(h->cutoff_version) != 0) + used = homa_snprintf(buffer, buf_len, used, + ", cutoff_version %d", + ntohs(h->cutoff_version)); +#else /* See strip.py */ + used = homa_snprintf(buffer, buf_len, used, + ", message_length %d, offset %d, data_length %d", + ntohl(h->message_length), offset, + seg_length); +#endif /* See strip.py */ + if (h->retransmit) + used = homa_snprintf(buffer, buf_len, used, + ", RETRANSMIT"); + if (skb_shinfo(skb)->gso_type == 0xd) + used = homa_snprintf(buffer, buf_len, used, + ", TSO disabled"); + if (skb_shinfo(skb)->gso_segs <= 1) + break; + pos = skb_transport_offset(skb) + sizeof(*h) + seg_length; + used = homa_snprintf(buffer, buf_len, used, ", extra segs"); + for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { + if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { + struct homa_seg_hdr seg; + + homa_skb_get(skb, &seg, pos, sizeof(seg)); + offset = 
ntohl(seg.offset); + } else { + offset += seg_length; + } + if (seg_length > data_left) + seg_length = data_left; + used = homa_snprintf(buffer, buf_len, used, + " %d@%d", seg_length, offset); + data_left -= seg_length; + pos += skb_shinfo(skb)->gso_size; + }; + break; + } +#ifndef __STRIP__ /* See strip.py */ + case GRANT: { + struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; + + used = homa_snprintf(buffer, buf_len, used, + ", offset %d, grant_prio %u", + ntohl(h->offset), h->priority); + break; + } +#endif /* See strip.py */ + case RESEND: { + struct homa_resend_hdr *h = (struct homa_resend_hdr *)header; + +#ifndef __STRIP__ /* See strip.py */ + used = homa_snprintf(buffer, buf_len, used, + ", offset %d, length %d, resend_prio %u", + ntohl(h->offset), ntohl(h->length), + h->priority); +#else /* See strip.py */ + used = homa_snprintf(buffer, buf_len, used, + ", offset %d, length %d", + ntohl(h->offset), ntohl(h->length)); +#endif /* See strip.py */ + break; + } + case RPC_UNKNOWN: + /* Nothing to add here. */ + break; + case BUSY: + /* Nothing to add here. */ + break; +#ifndef __STRIP__ /* See strip.py */ + case CUTOFFS: { + struct homa_cutoffs_hdr *h = (struct homa_cutoffs_hdr *)header; + + used = homa_snprintf(buffer, buf_len, used, + ", cutoffs %d %d %d %d %d %d %d %d, version %u", + ntohl(h->unsched_cutoffs[0]), + ntohl(h->unsched_cutoffs[1]), + ntohl(h->unsched_cutoffs[2]), + ntohl(h->unsched_cutoffs[3]), + ntohl(h->unsched_cutoffs[4]), + ntohl(h->unsched_cutoffs[5]), + ntohl(h->unsched_cutoffs[6]), + ntohl(h->unsched_cutoffs[7]), + ntohs(h->cutoff_version)); + break; + } + case FREEZE: + /* Nothing to add here. */ + break; +#endif /* See strip.py */ + case NEED_ACK: + /* Nothing to add here. */ + break; + case ACK: { + struct homa_ack_hdr *h = (struct homa_ack_hdr *)header; + int i, count; + + count = ntohs(h->num_acks); + used = homa_snprintf(buffer, buf_len, used, ", acks"); + for (i = 0; i < count; i++) { + used = homa_snprintf(buffer, buf_len, used, + " [sp %d, id %llu]", + ntohs(h->acks[i].server_port), + be64_to_cpu(h->acks[i].client_id)); + } + break; + } + } + + buffer[buf_len - 1] = 0; + return buffer; +} + +/** + * homa_print_packet_short() - Print a human-readable string describing the + * information in a Homa packet. This function generates a shorter + * description than homa_print_packet. + * @skb: Packet whose information should be printed. + * @buffer: Buffer in which to generate the string. + * @buf_len: Number of bytes available at @buffer. + * + * Return: @buffer + */ +char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) +{ + struct homa_common_hdr *common; + char header[HOMA_MAX_HEADER]; + + common = (struct homa_common_hdr *)header; + homa_skb_get(skb, header, 0, HOMA_MAX_HEADER); + switch (common->type) { + case DATA: { + struct homa_data_hdr *h = (struct homa_data_hdr *)header; + struct homa_skb_info *homa_info = homa_get_skb_info(skb); + int data_left, used, i, seg_length, pos, offset; + + if (skb_shinfo(skb)->gso_segs == 0) { + seg_length = homa_data_len(skb); + data_left = 0; + } else { + seg_length = homa_info->seg_length; + data_left = homa_info->data_bytes - seg_length; + } + offset = homa_get_offset(h); + + pos = skb_transport_offset(skb) + sizeof(*h) + seg_length; + used = homa_snprintf(buffer, buf_len, 0, "DATA%s %d@%d", + h->retransmit ? 
" retrans" : "", + seg_length, offset); + for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { + if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { + struct homa_seg_hdr seg; + + homa_skb_get(skb, &seg, pos, sizeof(seg)); + offset = ntohl(seg.offset); + } else { + offset += seg_length; + } + if (seg_length > data_left) + seg_length = data_left; + used = homa_snprintf(buffer, buf_len, used, + " %d@%d", seg_length, offset); + data_left -= seg_length; + pos += skb_shinfo(skb)->gso_size; + } + break; + } +#ifndef __STRIP__ /* See strip.py */ + case GRANT: { + struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; + + snprintf(buffer, buf_len, "GRANT %d@%d", ntohl(h->offset), + h->priority); + break; + } +#endif /* See strip.py */ + case RESEND: { + struct homa_resend_hdr *h = (struct homa_resend_hdr *)header; + +#ifndef __STRIP__ /* See strip.py */ + snprintf(buffer, buf_len, "RESEND %d-%d@%d", ntohl(h->offset), + ntohl(h->offset) + ntohl(h->length) - 1, + h->priority); +#else /* See strip.py */ + snprintf(buffer, buf_len, "RESEND %d-%d", ntohl(h->offset), + ntohl(h->offset) + ntohl(h->length) - 1); +#endif /* See strip.py */ + break; + } + case RPC_UNKNOWN: + snprintf(buffer, buf_len, "RPC_UNKNOWN"); + break; + case BUSY: + snprintf(buffer, buf_len, "BUSY"); + break; +#ifndef __STRIP__ /* See strip.py */ + case CUTOFFS: + snprintf(buffer, buf_len, "CUTOFFS"); + break; + case FREEZE: + snprintf(buffer, buf_len, "FREEZE"); + break; +#endif /* See strip.py */ + case NEED_ACK: + snprintf(buffer, buf_len, "NEED_ACK"); + break; + case ACK: + snprintf(buffer, buf_len, "ACK"); + break; + default: + snprintf(buffer, buf_len, "unknown packet type 0x%x", + common->type); + break; + } + return buffer; +} + +/** + * homa_freeze_peers() - Send FREEZE packets to all known peers in the + * root network namespace. + */ +void homa_freeze_peers(void) +{ + struct homa_socktab_scan scan; + struct homa_freeze_hdr freeze; + struct rhashtable_iter iter; + struct homa_peer *peer; + struct homa_sock *hsk; + struct homa_net *hnet; + int err; + + /* Find a socket to use (any socket for the namespace will do). */ + hnet = homa_net(&init_net); + rcu_read_lock(); + hsk = homa_socktab_start_scan(hnet->homa->socktab, &scan); + while (hsk && hsk->hnet != hnet) + hsk = homa_socktab_next(&scan); + homa_socktab_end_scan(&scan); + if (!hsk) { + tt_record("homa_freeze_peers couldn't find a socket"); + goto done; + } + + freeze.common.type = FREEZE; + freeze.common.sport = htons(hsk->port); + freeze.common.dport = 0; + IF_NO_STRIP(homa_set_hijack(&freeze.common)); + freeze.common.sender_id = 0; + + rhashtable_walk_enter(&hnet->homa->peertab->ht, &iter); + rhashtable_walk_start(&iter); + while (true) { + peer = rhashtable_walk_next(&iter); + if (!peer) + break; + if (IS_ERR(peer)) + /* Resize event occurred and walk will restart; + * that could result in duplicate freezes, but + * that's OK. + */ + continue; + if (peer->ht_key.hnet != hnet) + continue; + tt_record1("Sending freeze to 0x%x", tt_addr(peer->addr)); + err = __homa_xmit_control(&freeze, sizeof(freeze), peer, hsk); + if (err != 0) + tt_record2("homa_freeze_peers got error %d in xmit to 0x%x\n", + err, tt_addr(peer->addr)); + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + +done: + rcu_read_unlock(); +} + +/** + * homa_snprintf() - This function makes it easy to use a series of calls + * to snprintf to gradually append information to a fixed-size buffer. 
+ * If the buffer fills, the function can continue to be called, but nothing
+ * more will get added to the buffer.
+ * @buffer: Characters accumulate here.
+ * @size: Total space available in @buffer.
+ * @used: Number of bytes currently occupied in the buffer, not including
+ *        a terminating null character; this is typically the result of
+ *        the previous call to this function.
+ * @format: Format string suitable for passing to printf-like functions.
+ * @...: Values for the substitutions requested in @format.
+ *
+ * Return: The number of characters now occupied in @buffer, not
+ *         including the terminating null character.
+ */
+int homa_snprintf(char *buffer, int size, int used, const char *format, ...)
+{
+	int new_chars;
+	va_list ap;
+
+	if (used >= (size - 1))
+		return used;
+
+	va_start(ap, format);
+	new_chars = vsnprintf(buffer + used, size - used, format, ap);
+	va_end(ap);
+	if (new_chars < 0)
+		return used;
+	if (new_chars >= (size - used))
+		return size - 1;
+	return used + new_chars;
+}
+
+/**
+ * homa_symbol_for_state() - Returns a printable string describing an
+ * RPC state.
+ * @rpc:  RPC whose state should be returned in printable form.
+ *
+ * Return: A static string holding the current state of @rpc.
+ */
+char *homa_symbol_for_state(struct homa_rpc *rpc)
+{
+	static char buffer[20];
+
+	switch (rpc->state) {
+	case RPC_OUTGOING:
+		return "OUTGOING";
+	case RPC_INCOMING:
+		return "INCOMING";
+	case RPC_IN_SERVICE:
+		return "IN_SERVICE";
+	case RPC_DEAD:
+		return "DEAD";
+	}
+
+	/* A static buffer is safe enough here: this code is for debugging
+	 * only, and the snprintf below cannot overrun the buffer; the worst
+	 * case under concurrent use is garbled text.
+	 */
+	snprintf(buffer, sizeof(buffer) - 1, "unknown(%u)", rpc->state);
+	buffer[sizeof(buffer) - 1] = 0;
+	return buffer;
+}
+
+/**
+ * homa_symbol_for_type() - Returns a printable string describing a packet type.
+ * @type:  A value from those defined by &homa_packet_type.
+ *
+ * Return: A static string holding the packet type corresponding to @type.
+ */
+char *homa_symbol_for_type(uint8_t type)
+{
+	switch (type) {
+	case DATA:
+		return "DATA";
+#ifndef __STRIP__ /* See strip.py */
+	case GRANT:
+		return "GRANT";
+#endif /* See strip.py */
+	case RESEND:
+		return "RESEND";
+	case RPC_UNKNOWN:
+		return "RPC_UNKNOWN";
+	case BUSY:
+		return "BUSY";
+#ifndef __STRIP__ /* See strip.py */
+	case CUTOFFS:
+		return "CUTOFFS";
+	case FREEZE:
+		return "FREEZE";
+#endif /* See strip.py */
+	case NEED_ACK:
+		return "NEED_ACK";
+	case ACK:
+		return "ACK";
+	}
+	return "??";
+}
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_freeze() - Freezes the timetrace if a particular kind of freeze
+ * has been requested through sysctl.
+ * @rpc: If we freeze our timetrace, we'll also send a freeze request
+ *       to the peer for this RPC.
+ * @type: Condition that just occurred. If this doesn't match the
+ *        externally set "freeze_type" value, then we don't freeze.
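+ *        (For example, setting freeze_type to PACKET_LOST via sysctl
+ *        arranges for a freeze at a call such as
+ *        homa_freeze(rpc, PACKET_LOST, "lost packet, id %d, peer 0x%x");
+ *        this call site is illustrative.)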
+ * @format: Format string used to generate a time trace record describing + * the reason for the freeze; must include "id %d, peer 0x%x" + */ +void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) +{ + if (type != rpc->hsk->homa->freeze_type) + return; + rpc->hsk->homa->freeze_type = 0; + if (!atomic_read(&tt_frozen)) { +// struct homa_freeze_hdr freeze; + int dummy; + + pr_notice("freezing in %s with freeze_type %d\n", __func__, + type); + tt_record1("homa_freeze calling homa_rpc_log_active with freeze_type %d", type); + homa_rpc_log_active_tt(rpc->hsk->homa, 0); + homa_validate_incoming(rpc->hsk->homa, 1, &dummy); + pr_notice("%s\n", format); + tt_record2(format, rpc->id, tt_addr(rpc->peer->addr)); + tt_freeze(); +// homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); + homa_freeze_peers(); + } +} +#endif /* See strip.py */ + +/** + * homa_check_addr() - Verify that an address falls within the allowable + * range for kernel data. If not, crash the kernel. + * @p: Address to check. + */ +void homa_check_addr(void *p) +{ + uintptr_t addr = (uintptr_t)p; + + if ((addr & 0xffff800000000000) != 0xffff800000000000) { + pr_err("homa_check_addr received bogus address 0x%lx\n", addr); + tt_dbg1("foo"); + BUG_ON(1); + } +} + +/** + * homa_check_list() - Scan a list to make sure its pointer structure is + * not corrupted and that its length is bounded. Crashes the kernel if + * a problem is found. + * @list: Head of list to scan. + * @max_length: If the list has more than this many elements, it is + * assumed to have an internal loop. + */ +void homa_check_list(struct list_head *list, int max_length) +{ + struct list_head *p, *prev; + int num_elems; + + homa_check_addr(list->next); + homa_check_addr(list->prev); + prev = list; + for (p = list->next, num_elems = 0; ; p = p->next, num_elems++) { + if (p->prev != prev) { + pr_err("homa_check_list found bogus list structure: p->prev 0x%px, prev 0x%px\n", + p->prev, prev); + tt_dbg1("foo"); + BUG_ON(1); + } + if (p == list) + break; + if (num_elems > max_length) { + pr_err("homa_check_list found list with > %d elements\n", + max_length); + tt_dbg1("foo"); + BUG_ON(1); + } + homa_check_addr(p->next); + homa_check_addr(p->prev); + prev = p; + } +} + +/** + * homa_rpc_log() - Log info about a particular RPC; this is functionality + * pulled out of homa_rpc_log_active because its indentation got too deep. + * @rpc: RPC for which key info should be written to the system log. + */ +void homa_rpc_log(struct homa_rpc *rpc) +{ + char *type = homa_is_client(rpc->id) ? 
"Client" : "Server"; + char *peer = homa_print_ipv6_addr(&rpc->peer->addr); + + if (rpc->state == RPC_INCOMING) + pr_notice("%s RPC INCOMING, id %llu, peer %s:%d, %d/%d bytes received, incoming %d\n", + type, rpc->id, peer, rpc->dport, + rpc->msgin.length - rpc->msgin.bytes_remaining, +#ifndef __STRIP__ + rpc->msgin.length, rpc->msgin.granted); +#else + rpc->msgin.length, 0); +#endif /* __STRIP__ */ + else if (rpc->state == RPC_OUTGOING) { + pr_notice("%s RPC OUTGOING, id %llu, peer %s:%d, out length %d, left %d, granted %d, in left %d, resend_ticks %u, silent_ticks %d\n", + type, rpc->id, peer, rpc->dport, rpc->msgout.length, + rpc->msgout.length - rpc->msgout.next_xmit_offset, +#ifndef __STRIP__ + rpc->msgout.granted, rpc->msgin.bytes_remaining, +#else + 0, rpc->msgin.bytes_remaining, +#endif /* __STRIP__ */ + rpc->resend_timer_ticks, rpc->silent_ticks); + } else { + pr_notice("%s RPC %s, id %llu, peer %s:%d, incoming length %d, outgoing length %d\n", + type, homa_symbol_for_state(rpc), rpc->id, peer, + rpc->dport, rpc->msgin.length, rpc->msgout.length); + } +} + +/** + * homa_rpc_log_active() - Print information to the system log about all + * active RPCs. Intended primarily for debugging. + * @homa: Overall data about the Homa protocol implementation. + * @id: An RPC id: if nonzero, then only RPCs with this id will be + * logged. + */ +void homa_rpc_log_active(struct homa *homa, uint64_t id) +{ + struct homa_socktab_scan scan; + struct homa_sock *hsk; + struct homa_rpc *rpc; + int count = 0; + + pr_notice("Logging active Homa RPCs:\n"); + rcu_read_lock(); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); + hsk; hsk = homa_socktab_next(&scan)) { + if (list_empty(&hsk->active_rpcs) || hsk->shutdown) + continue; + + if (!homa_protect_rpcs(hsk)) + continue; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + count++; + if (id != 0 && id != rpc->id) + continue; + homa_rpc_log(rpc); + } + homa_unprotect_rpcs(hsk); + } + homa_socktab_end_scan(&scan); + rcu_read_unlock(); + pr_notice("Finished logging active Homa RPCs: %d active RPCs\n", count); +} + +/** + * homa_rpc_log_tt() - Log info about a particular RPC using timetraces. + * @rpc: RPC for which key info should be written to the system log. 
+ */ +void homa_rpc_log_tt(struct homa_rpc *rpc) +{ + if (rpc->state == RPC_INCOMING) { + int received = rpc->msgin.length + - rpc->msgin.bytes_remaining; + int rank; + + tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received", + rpc->id, tt_addr(rpc->peer->addr), + received, rpc->msgin.length); +#ifndef __STRIP__ + tt_record3("RPC id %d has incoming %d, granted %d", rpc->id, + rpc->msgin.granted - received, rpc->msgin.granted); + rank = rpc->msgin.rank; +#else /* __STRIP__ */ + rank = -1; +#endif /* __STRIP__ */ + tt_record4("RPC id %d: length %d, remaining %d, rank %d", + rpc->id, rpc->msgin.length, + rpc->msgin.bytes_remaining, rank); + if (rpc->msgin.num_bpages == 0) { + tt_record1("RPC id %d is blocked waiting for buffers", + rpc->id); + } else { + struct sk_buff *skb = skb_peek(&rpc->msgin.packets); + + if (!skb) { + tt_record2("RPC id %d has %d bpages allocated, no uncopied bytes", + rpc->id, rpc->msgin.num_bpages); + } else { + struct homa_data_hdr *h; + + h = (struct homa_data_hdr *) skb->data; + tt_record3("RPC id %d has %d bpages allocated, first uncopied offset %d", + rpc->id, rpc->msgin.num_bpages, + homa_get_offset(h)); + } + } + } else if (rpc->state == RPC_OUTGOING) { + tt_record4("Outgoing RPC id %d, peer 0x%x, %d/%d bytes sent", + rpc->id, tt_addr(rpc->peer->addr), + rpc->msgout.next_xmit_offset, + rpc->msgout.length); +#ifndef __STRIP__ + if (rpc->msgout.granted > rpc->msgout.next_xmit_offset) + tt_record3("RPC id %d has %d unsent grants (granted %d)", + rpc->id, rpc->msgout.granted - + rpc->msgout.next_xmit_offset, + rpc->msgout.granted); +#endif /* __STRIP__ */ + } else { + tt_record2("RPC id %d is in state %d", rpc->id, rpc->state); + } +} + +/** + * homa_rpc_log_active_tt() - Log information about all active RPCs using + * timetraces. + * @homa: Overall data about the Homa protocol implementation. + * @freeze_count: If nonzero, FREEZE requests will be sent for this many + * incoming RPCs with outstanding grants + */ +void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) +{ + struct homa_socktab_scan scan; + struct homa_sock *hsk; + struct homa_rpc *rpc; + int count = 0; + + tt_record("Logging Homa RPCs:"); + rcu_read_lock(); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); + hsk; hsk = homa_socktab_next(&scan)) { + if (list_empty(&hsk->active_rpcs) || hsk->shutdown) + continue; + + if (!homa_protect_rpcs(hsk)) + continue; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + struct homa_freeze_hdr freeze; + + count++; + homa_rpc_log_tt(rpc); + if (freeze_count == 0) + continue; + if (rpc->state != RPC_INCOMING) + continue; +#ifndef __STRIP__ + if (rpc->msgin.granted <= (rpc->msgin.length + - rpc->msgin.bytes_remaining)) + continue; +#endif /* __STRIP__ */ + freeze_count--; + pr_notice("Emitting FREEZE in %s\n", __func__); + homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); + } + homa_unprotect_rpcs(hsk); + } + homa_socktab_end_scan(&scan); + rcu_read_unlock(); + tt_record1("Finished logging (%d active Homa RPCs)", count); +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_validate_incoming() - Scan all of the active RPCs to compute what + * homa_total_incoming should be, and see if it actually matches. + * @homa: Overall data about the Homa protocol implementation. + * @verbose: Print incoming info for each individual RPC. + * @link_errors: Set to 1 if one or more grantable RPCs don't seem to + * be linked into the grantable lists. 
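+ *		(A nonzero value here, or a nonzero function result,
+ *		normally indicates a bookkeeping bug; homa_freeze calls this
+ *		function just before freezing so that the results appear in
+ *		the timetrace.)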
+ * Return: The difference between the actual value of homa->total_incoming
+ * and the expected value computed from the individual RPCs (positive
+ * means homa->total_incoming is higher than expected).
+ */
+int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors)
+{
+	struct homa_socktab_scan scan;
+	int total_incoming = 0;
+	struct homa_sock *hsk;
+	struct homa_rpc *rpc;
+	int actual;
+
+	tt_record1("homa_validate_incoming starting, total_incoming %d",
+		   atomic_read(&homa->grant->total_incoming));
+	*link_errors = 0;
+	rcu_read_lock();
+	for (hsk = homa_socktab_start_scan(homa->socktab, &scan);
+	     hsk; hsk = homa_socktab_next(&scan)) {
+		if (list_empty(&hsk->active_rpcs) || hsk->shutdown)
+			continue;
+
+		if (!homa_protect_rpcs(hsk))
+			continue;
+		list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
+			int incoming;
+
+			if (rpc->state != RPC_INCOMING)
+				continue;
+			incoming = rpc->msgin.granted -
+					(rpc->msgin.length
+					- rpc->msgin.bytes_remaining);
+			if (incoming < 0)
+				incoming = 0;
+			if (rpc->msgin.rec_incoming == 0)
+				continue;
+			total_incoming += rpc->msgin.rec_incoming;
+			if (verbose)
+				tt_record3("homa_validate_incoming: RPC id %d, incoming %d, rec_incoming %d",
+					   rpc->id, incoming,
+					   rpc->msgin.rec_incoming);
+			if (rpc->msgin.granted >= rpc->msgin.length)
+				continue;
+			if (list_empty(&rpc->grantable_links)) {
+				tt_record1("homa_validate_incoming: RPC id %d not linked in grantable list",
+					   rpc->id);
+				*link_errors = 1;
+			}
+			if (list_empty(&rpc->peer->grantable_links)) {
+				tt_record1("homa_validate_incoming: RPC id %d peer not linked in grantable list",
+					   rpc->id);
+				*link_errors = 1;
+			}
+		}
+		homa_unprotect_rpcs(hsk);
+	}
+	homa_socktab_end_scan(&scan);
+	rcu_read_unlock();
+	actual = atomic_read(&homa->grant->total_incoming);
+	tt_record3("homa_validate_incoming diff %d (expected %d, got %d)",
+		   actual - total_incoming, total_incoming, actual);
+	return actual - total_incoming;
+}
+
+/**
+ * homa_drop_packet() - Invoked for each incoming packet to determine
+ * (stochastically) whether that packet should be dropped. Used during
+ * development to exercise retry code.
+ * @homa: Overall information about the Homa transport
+ * Return: Nonzero means drop the packet, zero means process normally.
+ */
+int homa_drop_packet(struct homa *homa)
+{
+	/* This code is full of races, but they don't matter (better fast
+	 * than precise).
+	 */
+	if (homa->accept_bits == 0)
+		return 0;
+	while (1) {
+		if (accept_count > 0) {
+			accept_count--;
+			return 0;
+		}
+		if (drop_count > 0) {
+			drop_count--;
+			return 1;
+		}
+		if (seed == 0)
+			seed = homa_clock();
+		seed = seed * 1664525 + 1013904223;
+		accept_count = (seed >> 4) & ((1 << homa->accept_bits) - 1);
+		seed = seed * 1664525 + 1013904223;
+		drop_count = 1 + ((seed >> 4) & ((1 << homa->drop_bits) - 1));
+		tt_record2("homa_drop_packet set accept_count %d, drop_count 0x%x",
+			   accept_count, drop_count);
+	}
+}
+#endif /* See strip.py */
+
+/**
+ * homa_snapshot_get_stats() - Fill in a homa_rpc_snapshot with the latest
+ * statistics.
+ * @snap: Structure to fill in.
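+ *
+ * Rates can be derived by differencing two snapshots taken at different
+ * times, e.g. (a sketch; s1, s2, and delta are illustrative locals):
+ *
+ *	homa_snapshot_get_stats(&s1);
+ *	...
+ *	homa_snapshot_get_stats(&s2);
+ *	delta = s2.client_request_bytes_done - s1.client_request_bytes_done;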
+ */ +void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap) +{ + IF_NO_STRIP(int core); + + memset(snap, 0, sizeof(*snap)); + snap->clock = homa_clock(); +#ifndef __STRIP__ /* See strip.py */ + for (core = 0; core < nr_cpu_ids; core++) { + struct homa_metrics *m = &per_cpu(homa_metrics, core); + + snap->client_requests_started += m->client_requests_started; + snap->client_request_bytes_started += + m->client_request_bytes_started; + snap->client_request_bytes_done += m->client_request_bytes_done; + snap->client_requests_done += m->client_requests_done; + + snap->client_responses_started += m->client_responses_started; + snap->client_response_bytes_started += + m->client_response_bytes_started; + snap->client_response_bytes_done += + m->client_response_bytes_done; + snap->client_responses_done += m->client_responses_done; + + snap->server_requests_started += m->server_requests_started; + snap->server_request_bytes_started += + m->server_request_bytes_started; + snap->server_request_bytes_done += m->server_request_bytes_done; + snap->server_requests_done += m->server_requests_done; + + snap->server_responses_started += m->server_responses_started; + snap->server_response_bytes_started += + m->server_response_bytes_started; + snap->server_response_bytes_done += + m->server_response_bytes_done; + snap->server_responses_done += m->server_responses_done; + } +#endif /* See strip.py */ +} + +/** + * homa_snapshot_rpcs() - This function is called by homa_timer; it collects + * data about overall progress of client and server RPCs. + */ +void homa_snapshot_rpcs(void) +{ + struct homa_rpc_snapshot *snap; + u64 now = homa_clock(); + + if (snapshot_interval == 0) + snapshot_interval = homa_clock_khz() * RX_SNAPSHOT_INTERVAL; + + if (now < snapshot_time + snapshot_interval) + return; + snapshot_time = now; + snap = &rpc_snapshots[next_snapshot]; + homa_snapshot_get_stats(snap); + next_snapshot++; + if (next_snapshot >= MAX_RX_SNAPSHOTS) + next_snapshot = 0; +} + +/** + * homa_rpc_snapshot_log_tt() - Dump all of the RPC snapshot data to the + * timetrace. + */ +void homa_rpc_snapshot_log_tt(void) +{ + u64 creq_base, creq_bbase, cresp_base, cresp_bbase; + u64 sreq_base, sreq_bbase, sresp_base, sresp_bbase; + struct homa_rpc_snapshot *snap; + u64 now = homa_clock(); + u64 usecs; + int i; + + i = next_snapshot; + + /* Offset all the output values to start at 0, in order to avoid + * wraparound in 32-bit timetrace values. + */ + creq_base = rpc_snapshots[i].client_requests_done; + creq_bbase = rpc_snapshots[i].client_request_bytes_done; + cresp_base = rpc_snapshots[i].client_responses_done; + cresp_bbase = rpc_snapshots[i].client_response_bytes_done; + sreq_base = rpc_snapshots[i].server_requests_done; + sreq_bbase = rpc_snapshots[i].server_request_bytes_done; + sresp_base = rpc_snapshots[i].server_responses_done; + sresp_bbase = rpc_snapshots[i].server_response_bytes_done; + do { + snap = &rpc_snapshots[i]; + + /* Compute how many microseconds before now this snapshot + * was taken. 
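+		 * (now - snap->clock) is in homa_clock() units; multiplying
+		 * by 1000 and dividing by homa_clock_khz() (ticks per
+		 * millisecond) yields microseconds.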
+ */ + usecs = 1000*(now - snap->clock); + do_div(usecs, homa_clock_khz()); + + tt_record1("rpc snapshot usecs %d", -usecs); + tt_record4("rpc snapshot client requests started %d, kbytes_started %d, kbytes_done %d, done %d", + snap->client_requests_started - creq_base, + (snap->client_request_bytes_started - + creq_bbase) >> 10, + (snap->client_request_bytes_done - + creq_bbase) >> 10, + snap->client_requests_done - creq_base); + tt_record4("rpc snapshot client responses started %d, kbytes_started %d, kbytes_done %d, done %d", + snap->client_responses_started - cresp_base, + (snap->client_response_bytes_started - + cresp_bbase) >> 10, + (snap->client_response_bytes_done - + cresp_bbase) >> 10, + snap->client_responses_done - cresp_base); + tt_record4("rpc snapshot server requests started %d, kbytes_started %d, kbytes_done %d, done %d", + snap->server_requests_started - sreq_base, + (snap->server_request_bytes_started - + sreq_bbase) >> 10, + (snap->server_request_bytes_done - + sreq_bbase) >> 10, + snap->server_requests_done - sreq_base); + tt_record4("rpc snapshot server responses started %d, kbytes_started %d, kbytes_done %d, done %d", + snap->server_responses_started - sresp_base, + (snap->server_response_bytes_started - + sresp_bbase) >> 10, + (snap->server_response_bytes_done - + sresp_bbase) >> 10, + snap->server_responses_done - sresp_base); + + i++; + if (i >= MAX_RX_SNAPSHOTS) + i = 0; + } while (i != next_snapshot); +} + +/** + * homa_rpc_stats_log() - Print statistics on RPC progress to the system log. + */ +void homa_rpc_stats_log(void) +{ + struct homa_rpc_snapshot snap; + + homa_snapshot_get_stats(&snap); + pr_notice("Client requests: started %llu, done %llu, delta %llu\n", + snap.client_requests_started, snap.client_requests_done, + snap.client_requests_started - snap.client_requests_done); + pr_notice("Client request bytes: started %llu, bytes_done %llu, delta %llu\n", + snap.client_request_bytes_started, + snap.client_request_bytes_done, + snap.client_request_bytes_started - + snap.client_request_bytes_done); + pr_notice("Client responses: started %llu, done %llu, delta %llu\n", + snap.client_responses_started, snap.client_responses_done, + snap.client_responses_started - snap.client_responses_done); + pr_notice("Client response bytes: started %llu, bytes_done %llu, delta %llu\n", + snap.client_response_bytes_started, + snap.client_response_bytes_done, + snap.client_response_bytes_started - + snap.client_response_bytes_done); + pr_notice("Server requests: started %llu, done %llu, delta %llu\n", + snap.server_requests_started, snap.server_requests_done, + snap.server_requests_started - snap.server_requests_done); + pr_notice("Server request bytes: started %llu, bytes_done %llu, delta %llu\n", + snap.server_request_bytes_started, + snap.server_request_bytes_done, + snap.server_request_bytes_started - + snap.server_request_bytes_done); + pr_notice("Server responses: started %llu, done %llu, delta %llu\n", + snap.server_responses_started, snap.server_responses_done, + snap.server_responses_started - snap.server_responses_done); + pr_notice("Server response bytes: started %llu, bytes_done %llu, delta %llu\n", + snap.server_response_bytes_started, + snap.server_response_bytes_done, + snap.server_response_bytes_started - + snap.server_response_bytes_done); +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_rpcs_deferred() - Return true if there are any RPCs with packets + * that have been deferred by homa_qdisc, false if there are none. 
+ * @homa: Overall information about the Homa protocol. + * Return: See above. + */ +bool homa_rpcs_deferred(struct homa *homa) +{ + struct homa_qdisc_shared *qshared = homa->qshared; + struct homa_qdisc_dev *qdev; + bool result = false; + + rcu_read_lock(); + list_for_each_entry_rcu(qdev, &qshared->qdevs, links) { + if (homa_qdisc_any_deferred(qdev)) { + result = true; + break; + } + } + rcu_read_unlock(); + return result; +} + +/** + * homa_validate_rbtree() - Scan the structure of a red-black tree and + * abort the kernel (dumping the timetrace) if the internal structure + * does not satisfy the required invariants. + * @node: Node whose subtree should be scanned. + * @depth: Depth of node (number of black nodes above this node, 0 for + * root). + * @message: Textual message identifying the point where this function + * was invoked (used when reporting errors). + */ +void homa_validate_rbtree(struct rb_node *node, int depth, char *message) +{ + struct homa_rpc *rpc, *child_rpc; + struct rb_node *child; + static int max_depth; + int black, new_depth; + + if (depth == 0) { + if (!node) + return; + if (!rb_is_black(node)) { +#ifdef __UNIT_TEST__ + FAIL("rbtree root is red"); +#else + tt_record("freezing because rbtree root is red"); +#endif /* __UNIT_TEST__ */ + goto error; + } + max_depth = -1; + } + + rpc = container_of(node, struct homa_rpc, qrpc.rb_node); + if (rpc->magic != HOMA_RPC_MAGIC) { +#ifdef __UNIT_TEST__ + FAIL("rpc id %llu (0x%px) in rbtree has bad magic 0x%x", + rpc->id, rpc, rpc->magic); +#else + tt_record4("freezing because rpc id %d (0x%x%08x) in rbtree has bad magic 0x%x", + rpc->id, tt_hi(rpc), tt_lo(rpc), rpc->magic); +#endif /* __UNIT_TEST__ */ + goto error; + + } + + black = rb_is_black(node); + new_depth = depth + black; + if (!node->rb_left || !node->rb_right) { + if (max_depth < 0) { + max_depth = new_depth; + } else if (max_depth != new_depth) { +#ifdef __UNIT_TEST__ + FAIL("inconsistent rbtree depths: %d and %d", + max_depth, new_depth); +#else + tt_record2("freezing because of inconsistent rbtree depths: %d and %d", + max_depth, depth); +#endif /* __UNIT_TEST__ */ + goto error; + } + goto done; + } + + child = node->rb_left; + if (child) { + child_rpc = container_of(child, struct homa_rpc, qrpc.rb_node); + if (__rb_parent(child->__rb_parent_color) != node) { +#ifdef __UNIT_TEST__ + FAIL("rbtree left child has bad parent, rpc id %llu", + child_rpc->id); +#else + tt_record1("freezing because rbtree left child has bad parent, rpc id %llu", + child_rpc->id); +#endif /* __UNIT_TEST__ */ + goto error; + } + if (!black && !rb_is_black(child)) { +#ifdef __UNIT_TEST__ + FAIL("rbtree red parent has red left child"); +#else + tt_record("rbtree red parent has red left child"); +#endif /* __UNIT_TEST__ */ + goto error; + } + if (!homa_qdisc_precedes(child_rpc, rpc)) { +#ifdef __UNIT_TEST__ + FAIL("rbtree contained out-of-order left child"); +#else + tt_record("freezing because rbtree contained out-of-order left child"); +#endif /* __UNIT_TEST__ */ + goto error; + } + homa_validate_rbtree(child, depth + black, message); + } + + child = node->rb_right; + if (child) { + if (__rb_parent(child->__rb_parent_color) != node) { +#ifdef __UNIT_TEST__ + FAIL("rbtree right child has bad parent, rpc id %llu", + child_rpc->id); +#else + tt_record1("freezing because rbtree right child has bad parent, rpc id %llu", + child_rpc->id); +#endif /* __UNIT_TEST__ */ + goto error; + } + if (!black && !rb_is_black(child)) { +#ifdef __UNIT_TEST__ + FAIL("rbtree red parent has red right child"); 
+#else + tt_record("freezing because rbtree red parent has red right child"); +#endif /* __UNIT_TEST__ */ + goto error; + } + child_rpc = container_of(child, struct homa_rpc, qrpc.rb_node); + if (!homa_qdisc_precedes(rpc, child_rpc)) { +#ifdef __UNIT_TEST__ + FAIL("rbtree contained out-of-order right child"); +#else + tt_record("freezing because rbtree rbtree contained out-of-order right child"); +#endif /* __UNIT_TEST__ */ + goto error; + } + homa_validate_rbtree(child, depth + black, message); + } + +done: + return; + +error: +#ifndef __UNIT_TEST__ + tt_record(message); + if (!atomic_read(&tt_frozen)) { + tt_freeze(); + pr_err("rbtree consistency error at %s\n", message); + tt_printk(); + BUG_ON(1); + } +#endif /* __UNIT_TEST__ */ +} +#endif /* See strip.py */ diff --git a/homa_devel.h b/homa_devel.h new file mode 100644 index 00000000..3db08a36 --- /dev/null +++ b/homa_devel.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ + +/* This file defines functions that are useful during Homa development; + * they are not present in the upstreamed version of Homa in Linux. + */ + +#ifndef _HOMA_DEVEL_H +#define _HOMA_DEVEL_H + +#ifdef __UNIT_TEST__ +#ifndef __NO_KSELFTEST__ +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#endif /* __NO_KSELFTEST__ */ +#endif /* __UNIT_TEST__ */ + +#include "timetrace.h" + +#ifdef __STRIP__ +#define INC_METRIC(...) + +#undef LINUX_VERSION_CODE +#define LINUX_VERSION_CODE 100 + +#undef KERNEL_VERSION +#define KERNEL_VERSION(...) 100 +#endif /* __STRIP__ */ + +struct homa; +struct homa_net; +struct homa_rpc; + +/** + * enum homa_freeze_type - The @type argument to homa_freeze must be + * one of these values. + */ +enum homa_freeze_type { + RESTART_RPC = 1, + PEER_TIMEOUT = 2, + SLOW_RPC = 3, + PACKET_LOST = 4, + NEED_ACK_MISSING_DATA = 5, +}; + +/** + * struct homa_rpc_snapshot - Captures the state of RPCs (both client and + * server) on a node at a given point in time. + */ +struct homa_rpc_snapshot { + /** @clock: homa_clock() value when data was gathered. */ + u64 clock; + + /* Each value below is the sum (across all cores) of the metric with + * the same name. + */ + u64 client_requests_started; + u64 client_request_bytes_started; + u64 client_request_bytes_done; + u64 client_requests_done; + + u64 client_responses_started; + u64 client_response_bytes_started; + u64 client_response_bytes_done; + u64 client_responses_done; + + u64 server_requests_started; + u64 server_request_bytes_started; + u64 server_request_bytes_done; + u64 server_requests_done; + + u64 server_responses_started; + u64 server_response_bytes_started; + u64 server_response_bytes_done; + u64 server_responses_done; +}; + +/** + * tt_addr() - Given an address, return a 4-byte id that will (hopefully) + * provide a unique identifier for the address in a timetrace record. + * @x: Address (either IPv6 or IPv4-mapped IPv6) + * Return: see above + */ +static inline u32 tt_addr(const struct in6_addr x) +{ + return ipv6_addr_v4mapped(&x) ? ntohl(x.in6_u.u6_addr32[3]) + : (x.in6_u.u6_addr32[3] ? 
ntohl(x.in6_u.u6_addr32[3]) + : ntohl(x.in6_u.u6_addr32[1])); +} + +static inline void check_addr_valid(void *addr, char *info) +{ +#ifndef __UNIT_TEST__ +#define HIGH_BITS 0xffff800000000000 + u64 int_addr = (u64)addr; + + if ((int_addr & HIGH_BITS) != HIGH_BITS) { + pr_err("Bogus address 0x%px (%s))\n", addr, info); + tt_record("Freezing timetrace because of bogus address"); + tt_record(info); + tt_freeze(); + tt_printk(); + pr_err("Finished dumping timetrace\n"); + BUG_ON(1); + } +#endif /* __UNIT_TEST__ */ +} + +#ifndef __STRIP__ /* See strip.py */ +#define IF_NO_STRIP(code) code +#else /* See strip.py */ +#define IF_NO_STRIP(...) +#endif /* See strip.py */ + +void homa_check_addr(void *p); +void homa_check_list(struct list_head *list, int max_length); +int homa_drop_packet(struct homa *homa); +void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, + char *format); +void homa_freeze_peers(void); +char *homa_print_ipv4_addr(__be32 addr); +char *homa_print_ipv6_addr(const struct in6_addr *addr); +char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); +char *homa_print_packet_short(struct sk_buff *skb, char *buffer, + int buf_len); +void homa_rpc_log(struct homa_rpc *rpc); +void homa_rpc_log_active(struct homa *homa, uint64_t id); +void homa_rpc_log_tt(struct homa_rpc *rpc); +void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); +void homa_rpc_snapshot_log_tt(void); +void homa_rpc_stats_log(void); +void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap); +void homa_snapshot_rpcs(void); +int homa_snprintf(char *buffer, int size, int used, + const char *format, ...) __printf(4, 5); +char *homa_symbol_for_type(uint8_t type); +char *homa_symbol_for_state(struct homa_rpc *rpc); +int homa_validate_incoming(struct homa *homa, int verbose, + int *link_errors); + +#ifndef __STRIP__ /* See strip.py */ +bool homa_rpcs_deferred(struct homa *homa); +void homa_validate_rbtree(struct rb_node *node, int depth, char *message); +#endif /* See strip.py */ + +#endif /* _HOMA_DEVEL_H */ diff --git a/homa_grant.c b/homa_grant.c new file mode 100644 index 00000000..1ea24a8d --- /dev/null +++ b/homa_grant.c @@ -0,0 +1,1181 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +/* This file contains functions related to issuing grants for incoming + * messages. + */ + +#include "homa_impl.h" +#include "homa_grant.h" +#include "homa_pacer.h" +#include "homa_peer.h" +#include "homa_rpc.h" +#include "homa_wire.h" + +#ifndef __STRIP__ /* See strip.py */ +/* Used to enable sysctl access to grant-specific configuration parameters. The + * @data fields are actually offsets within a struct homa_grant; these are + * converted to pointers into a net-specific struct grant later. 
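+ * For example, OFFSET(max_incoming) records
+ * offsetof(struct homa_grant, max_incoming); homa_grant_dointvec then
+ * adds that offset to the address of the appropriate struct homa_grant
+ * to locate the actual parameter (a sketch of the mechanism; see
+ * homa_grant_dointvec for the details).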
+ */ +#define OFFSET(field) ((void *)offsetof(struct homa_grant, field)) +static struct ctl_table grant_ctl_table[] = { + { + .procname = "fifo_grant_increment", + .data = OFFSET(fifo_grant_increment), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "grant_fifo_fraction", + .data = OFFSET(fifo_fraction), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "grant_recalc_usecs", + .data = OFFSET(recalc_usecs), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "max_grantable_rpcs", + .data = OFFSET(max_grantable_rpcs), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "max_incoming", + .data = OFFSET(max_incoming), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "max_overcommit", + .data = OFFSET(max_overcommit), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "max_rpcs_per_peer", + .data = OFFSET(max_rpcs_per_peer), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "window", + .data = OFFSET(window_param), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, +}; +#endif /* See strip.py */ + +/** + * homa_grant_alloc() - Allocate and initialize a new grant object, which + * will hold grant management information for @homa. + * @homa: The struct homa that the new object is associated with. + * Return: A pointer to the new struct grant, or a negative errno. + */ +struct homa_grant *homa_grant_alloc(struct homa *homa) +{ + struct homa_grant *grant; + int err; + + grant = kzalloc(sizeof(*grant), GFP_KERNEL); + if (!grant) + return ERR_PTR(-ENOMEM); + grant->homa = homa; + atomic_set(&grant->stalled_rank, INT_MAX); + grant->max_incoming = 400000; + spin_lock_init(&grant->lock); + INIT_LIST_HEAD(&grant->grantable_peers); + grant->window_param = 10000; + grant->max_rpcs_per_peer = 1; + grant->max_overcommit = 8; + grant->recalc_usecs = 20; + grant->fifo_grant_increment = 50000; + grant->fifo_fraction = 50; + +#ifndef __STRIP__ /* See strip.py */ + grant->sysctl_header = register_net_sysctl(&init_net, "net/homa", + grant_ctl_table); + if (!grant->sysctl_header) { + err = -ENOMEM; + pr_err("couldn't register sysctl parameters for Homa grants\n"); + goto error; + } +#endif /* See strip.py */ + homa_grant_update_sysctl_deps(grant); + grant->next_recalc = homa_clock() + grant->recalc_cycles; + return grant; + +error: + homa_grant_free(grant); + return ERR_PTR(err); +} + +/** + * homa_grant_free() - Cleanup and free the grant object for a Homa + * transport. + * @grant: Object to free; caller must not reference the object + * again once this function returns. + */ +void homa_grant_free(struct homa_grant *grant) +{ +#ifndef __STRIP__ /* See strip.py */ + if (grant->sysctl_header) { + unregister_net_sysctl_table(grant->sysctl_header); + grant->sysctl_header = NULL; + } +#endif /* See strip.py */ + kfree(grant); +} + +/** + * homa_grant_init_rpc() - Initialize grant-related information for an + * RPC's incoming message (may add the RPC to grant priority queues). + * @rpc: RPC being initialized. Grant-related fields in msgin + * are assumed to be zero. Must be locked by caller. + * @unsched: Number of unscheduled bytes in the incoming message for @rpc. 
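+ *
+ * For example (numbers illustrative), for a 1-MB message with 60000
+ * unscheduled bytes, msgin.granted starts at 60000 and the remaining
+ * bytes are granted incrementally once the RPC has been added to the
+ * grant priority queues.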
+ */ +void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) + __must_hold(rpc->bucket->lock) +{ + rpc->msgin.rank = -1; + if (unsched >= rpc->msgin.length) { + rpc->msgin.granted = rpc->msgin.length; + rpc->msgin.prev_grant = rpc->msgin.granted; + return; + } + rpc->msgin.granted = unsched; + rpc->msgin.prev_grant = unsched; + if (rpc->msgin.num_bpages != 0) + /* Can't issue grants unless buffer space has been allocated + * for the message. + */ + homa_grant_manage_rpc(rpc); +} + +/** + * homa_grant_end_rpc() - This function is invoked when homa_rpc_end is + * invoked; it cleans up any state related to grants for that RPC's + * incoming message. + * @rpc: The RPC to clean up. Must be locked by the caller. This function + * may release and then reacquire the lock. + */ +void homa_grant_end_rpc(struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) +{ + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_grant_candidates cand; + + if (rpc->msgin.granted < rpc->msgin.length) { + homa_grant_cand_init(&cand); + homa_grant_unmanage_rpc(rpc, &cand); + if (!homa_grant_cand_empty(&cand)) { + homa_rpc_unlock(rpc); + homa_grant_cand_check(&cand, grant); + homa_rpc_lock(rpc); + } + } + + if (rpc->msgin.rec_incoming != 0) { + atomic_sub(rpc->msgin.rec_incoming, &grant->total_incoming); + rpc->msgin.rec_incoming = 0; + } +} + +/** + * homa_grant_window() - Return the window size (maximum number of granted + * but not received bytes for a message) given current conditions. + * @grant: Overall information for grant management. + * Return: See above. + */ +int homa_grant_window(struct homa_grant *grant) +{ + u64 window; + + window = grant->window_param; + if (window == 0) { + /* Dynamic window sizing uses the approach described in the + * paper "Dynamic Queue Length Thresholds for Shared-Memory + * Packet Switches" with an alpha value of 1. The idea is to + * maintain unused incoming capacity (for new RPC arrivals) + * equal to the amount of incoming allocated to each of the + * current RPCs. + */ + window = grant->max_incoming; + do_div(window, grant->num_active_rpcs + 1); + } + return window; +} + +/** + * homa_grant_outranks() - Returns nonzero if rpc1 should be considered + * higher priority for grants than rpc2, and zero if the two RPCS are + * equivalent or rpc2 is higher priority. + * @rpc1: First RPC to consider. + * @rpc2: Second RPC to consider. + * Return: see above + */ +int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) +{ + /* Fewest ungranted bytes is the primary criterion; if those are + * equal, then favor the older RPC. + */ + int grant_diff; + + grant_diff = (rpc1->msgin.length - rpc1->msgin.granted) - + (rpc2->msgin.length - rpc2->msgin.granted); + return grant_diff < 0 || ((grant_diff == 0) && + (rpc1->msgin.birth < rpc2->msgin.birth)); +} + +/** + * homa_grant_priority() - Return the appropriate priority to use in a + * grant for an incoming message. + * @homa: Overall information about the Homa transport. + * @rank: Position of the message's RPC in active_rpcs (lower means + * higher priority). + * Return: See above. + */ +int homa_grant_priority(struct homa *homa, int rank) +{ + int max_sched_prio, extra_levels, priority; + + /* If there aren't enough active RPCs to consume all of the priority + * levels, use only the lower levels; this allows faster preemption + * if a new high-priority message appears. 
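+	 * For example, with max_sched_prio 5 and two active RPCs,
+	 * extra_levels is 4, so ranks 0 and 1 map to priorities 1 and 0,
+	 * leaving levels 2-5 available for a newly arriving short message.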
+ */ + max_sched_prio = homa->max_sched_prio; + priority = max_sched_prio - rank; + extra_levels = max_sched_prio + 1 - homa->grant->num_active_rpcs; + if (extra_levels >= 0) + priority -= extra_levels; + return (priority < 0) ? 0 : priority; +} + +/** + * homa_grant_insert_active() - Try to insert an RPC in homa->active_rpcs. + * @rpc: RPC to insert (if possible). + * Return: NULL if there was room to insert @rpc without ejecting any other + * RPC. Otherwise, returns an RPC that must be added to + * homa->grantable_peers (could be either @rpc or some other RPC + * that @rpc displaced). + */ +struct homa_rpc *homa_grant_insert_active(struct homa_rpc *rpc) + __must_hold(rpc->hsk->homa->grant->lock) +{ + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_rpc *other, *result; + int insert_after; + int last_to_copy; + int peer_index; + int i; + + /* Scan active_rpcs backwards to find the lowest-priority message + * with higher priority than @rpc. Also find the lowest-priority + * message with the same peer as @rpc, if one appears. + */ + insert_after = -1; + peer_index = -1; + for (i = grant->num_active_rpcs - 1; i >= 0; i--) { + other = grant->active_rpcs[i]; + if (!homa_grant_outranks(rpc, other)) { + insert_after = i; + break; + } + if (peer_index < 0 && other->peer == rpc->peer) + peer_index = i; + } + + if (rpc->peer->active_rpcs >= grant->max_rpcs_per_peer) { + if (peer_index <= i) + /* All the other RPCs with the same peer are higher + * priority than @rpc and we can't have any more RPCs + * with the same peer, so bump @rpc. + */ + return rpc; + + /* Bump the lowest priority RPC from the same peer to make room + * for the new RPC. @rpc will be in a slot with lower index + * (higher priority) than the bumped one. + */ + result = grant->active_rpcs[peer_index]; + result->msgin.rank = -1; + result->peer->active_rpcs--; + last_to_copy = peer_index - 1; + } else { + if (insert_after >= grant->max_overcommit - 1) + /* active_rpcs is full and @rpc is too low priority; + * bump it. + */ + return rpc; + + if (grant->num_active_rpcs >= grant->max_overcommit) { + result = grant->active_rpcs[grant->num_active_rpcs - 1]; + result->msgin.rank = -1; + result->peer->active_rpcs--; + last_to_copy = grant->num_active_rpcs - 2; + } else { + result = NULL; + last_to_copy = grant->num_active_rpcs - 1; + grant->num_active_rpcs++; + } + } + + /* Move existing RPCs in active_rpcs down to make room for @rpc. */ + for (i = last_to_copy; i > insert_after; i--) { + other = grant->active_rpcs[i]; + other->msgin.rank = i + 1; + grant->active_rpcs[i + 1] = other; + } + grant->active_rpcs[insert_after + 1] = rpc; + rpc->msgin.rank = insert_after + 1; + rpc->peer->active_rpcs++; + + return result; +} + +/** + * homa_grant_adjust_peer() - This function is invoked when the contents + * of a peer's grantable_rpcs list has changed, so it's possible that + * the position of this peer in grantable_peers is no longer correct. The + * function adjusts the position of peer in grantable_peers (which could + * include adding or removing the peer to/from grantable_peers). 
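+ * For example (hypothetical scenario): if the head RPC of @peer's
+ * grantable_rpcs list was just promoted into active_rpcs, the new head
+ * may have more ungranted bytes, so the peer may need to move toward
+ * the tail of grantable_peers (or leave the list entirely if it has no
+ * grantable RPCs left).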
+ * @grant: Overall information about grants + * @peer: Peer to adjust + */ +void homa_grant_adjust_peer(struct homa_grant *grant, struct homa_peer *peer) + __must_hold(&grant->lock) +{ + struct homa_rpc *head, *other_rpc; + struct homa_peer *other_peer; + + if (list_empty(&peer->grantable_rpcs)) { + list_del_init(&peer->grantable_links); + return; + } + + head = list_first_entry(&peer->grantable_rpcs, + struct homa_rpc, grantable_links); + if (list_empty(&peer->grantable_links)) { + /* Must add peer to grantable_peers. */ + list_for_each_entry(other_peer, &grant->grantable_peers, + grantable_links) { + other_rpc = list_first_entry(&other_peer->grantable_rpcs, + struct homa_rpc, + grantable_links); + if (homa_grant_outranks(head, other_rpc)) { + list_add_tail(&peer->grantable_links, + &other_peer->grantable_links); + return; + } + } + list_add_tail(&peer->grantable_links, &grant->grantable_peers); + return; + } + + /* The peer is on grantable_peers; this loop moves it upward, if + * needed. + */ + while (peer != list_first_entry(&grant->grantable_peers, + struct homa_peer, grantable_links)) { + other_peer = list_prev_entry(peer, grantable_links); + other_rpc = list_first_entry(&other_peer->grantable_rpcs, + struct homa_rpc, grantable_links); + if (!homa_grant_outranks(head, other_rpc)) + break; + __list_del_entry(&other_peer->grantable_links); + list_add(&other_peer->grantable_links, &peer->grantable_links); + } + + /* This loop moves the peer downward in grantable_peers, if needed. */ + while (peer != list_last_entry(&grant->grantable_peers, + struct homa_peer, grantable_links)) { + other_peer = list_next_entry(peer, grantable_links); + other_rpc = list_first_entry(&other_peer->grantable_rpcs, + struct homa_rpc, grantable_links); + if (!homa_grant_outranks(other_rpc, head)) + break; + __list_del_entry(&peer->grantable_links); + list_add(&peer->grantable_links, &other_peer->grantable_links); + } +} + +/** + * homa_grant_insert_grantable() - Insert an RPC into the grantable list + * for its peer. + * @rpc: The RPC to add. Must not currently be in either active_rpcs + * or grantable_peers. + */ +void homa_grant_insert_grantable(struct homa_rpc *rpc) + __must_hold(rpc->hsk->homa->grant->lock) +{ + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_peer *peer = rpc->peer; + struct homa_rpc *other; + + /* Insert @rpc in the right place in the grantable_rpcs list for + * its peer. + */ + list_for_each_entry(other, &peer->grantable_rpcs, grantable_links) { + if (homa_grant_outranks(rpc, other)) { + list_add_tail(&rpc->grantable_links, + &other->grantable_links); + goto position_peer; + } + } + list_add_tail(&rpc->grantable_links, &peer->grantable_rpcs); + +position_peer: + homa_grant_adjust_peer(grant, peer); +} + +/** + * homa_grant_manage_rpc() - Insert an RPC into the priority-based data + * structures for managing grantable RPCs (active_rpcs or grantable_peers). + * Ensures that the RPC will be sent grants as needed. + * @rpc: The RPC to add. Must be locked by caller. 
+ */
+void homa_grant_manage_rpc(struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_grant *grant = rpc->hsk->homa->grant;
+	struct homa_rpc *bumped;
+	u64 time = homa_clock();
+
+	BUG_ON(rpc->msgin.rank >= 0 || !list_empty(&rpc->grantable_links));
+
+	homa_grant_lock(grant);
+
+	INC_METRIC(grantable_rpcs_integral, grant->num_grantable_rpcs *
+		   (time - grant->last_grantable_change));
+	grant->last_grantable_change = time;
+	grant->num_grantable_rpcs++;
+	tt_record2("Incremented num_grantable_rpcs to %d, id %d",
+		   grant->num_grantable_rpcs, rpc->id);
+	if (grant->num_grantable_rpcs > grant->max_grantable_rpcs)
+		grant->max_grantable_rpcs = grant->num_grantable_rpcs;
+
+	bumped = homa_grant_insert_active(rpc);
+	if (bumped)
+		homa_grant_insert_grantable(bumped);
+	grant->window = homa_grant_window(grant);
+
+	homa_grant_unlock(grant);
+}
+
+/**
+ * homa_grant_remove_grantable() - Unlink an RPC from the grantable lists,
+ * so it will no longer be considered for grants.
+ * @rpc: RPC to remove from grantable lists. Must currently be in
+ * a grantable list.
+ */
+void homa_grant_remove_grantable(struct homa_rpc *rpc)
+	__must_hold(rpc->hsk->homa->grant->lock)
+{
+	struct homa_peer *peer = rpc->peer;
+	struct homa_rpc *head;
+
+	head = list_first_entry(&peer->grantable_rpcs,
+				struct homa_rpc, grantable_links);
+	list_del_init(&rpc->grantable_links);
+	if (rpc == head)
+		homa_grant_adjust_peer(rpc->hsk->homa->grant, peer);
+}
+
+/**
+ * homa_grant_remove_active() - Remove an RPC from active_rpcs and promote
+ * an RPC from grantable_peers if possible.
+ * @rpc: RPC that no longer needs grants. Must have rank >= 0.
+ * @cand: If an RPC is promoted into active_rpcs it is added here.
+ */
+void homa_grant_remove_active(struct homa_rpc *rpc,
+			      struct homa_grant_candidates *cand)
+	__must_hold(rpc->hsk->homa->grant->lock)
+{
+	struct homa_grant *grant = rpc->hsk->homa->grant;
+	struct homa_peer *peer;
+	struct homa_rpc *other;
+	int i;
+
+	for (i = rpc->msgin.rank + 1; i < grant->num_active_rpcs; i++) {
+		other = grant->active_rpcs[i];
+		other->msgin.rank = i - 1;
+		grant->active_rpcs[i - 1] = other;
+	}
+	rpc->msgin.rank = -1;
+	rpc->peer->active_rpcs--;
+	grant->num_active_rpcs--;
+	grant->active_rpcs[grant->num_active_rpcs] = NULL;
+
+	/* Pull the highest-priority entry (if there is one) from
+	 * grantable_peers into active_rpcs.
+	 */
+	list_for_each_entry(peer, &grant->grantable_peers, grantable_links) {
+		if (peer->active_rpcs >= grant->max_rpcs_per_peer)
+			continue;
+		other = list_first_entry(&peer->grantable_rpcs,
+					 struct homa_rpc,
+					 grantable_links);
+		homa_grant_remove_grantable(other);
+		peer->active_rpcs++;
+		grant->active_rpcs[grant->num_active_rpcs] = other;
+		other->msgin.rank = grant->num_active_rpcs;
+		grant->num_active_rpcs++;
+		homa_grant_cand_add(cand, other);
+		break;
+	}
+}
+
+/**
+ * homa_grant_unmanage_rpc() - Make sure that an RPC is no longer present
+ * in the priority structures used to manage grants (active_rpcs and
+ * grantable_peers). The RPC will no longer receive grants.
+ * @rpc: RPC to unlink.
+ * @cand: If an RPC is promoted into active_rpcs, it is added here.
+ */
+void homa_grant_unmanage_rpc(struct homa_rpc *rpc,
+			     struct homa_grant_candidates *cand)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_grant *grant = rpc->hsk->homa->grant;
+	u64 time = homa_clock();
+	bool removed = false;
+
+	homa_grant_lock(grant);
+
+	if (rpc->msgin.rank >= 0) {
+		homa_grant_remove_active(rpc, cand);
+		removed = true;
+	}
+	if (!list_empty(&rpc->grantable_links)) {
+		homa_grant_remove_grantable(rpc);
+		removed = true;
+	}
+	if (removed) {
+		INC_METRIC(grantable_rpcs_integral, grant->num_grantable_rpcs
+			   * (time - grant->last_grantable_change));
+		grant->last_grantable_change = time;
+		grant->num_grantable_rpcs--;
+		tt_record2("Decremented num_grantable_rpcs to %d, id %d",
+			   grant->num_grantable_rpcs, rpc->id);
+		grant->window = homa_grant_window(grant);
+	}
+	if (rpc == grant->oldest_rpc) {
+		homa_rpc_put(rpc);
+		grant->oldest_rpc = NULL;
+	}
+
+	homa_grant_unlock(grant);
+}
+
+/**
+ * homa_grant_update_incoming() - Figure out how much incoming data there is
+ * for an RPC (i.e., data that has been granted but not yet received) and make
+ * sure this is properly reflected in rpc->msgin.rec_incoming
+ * and grant->total_incoming.
+ * @rpc: RPC to check; must be locked.
+ * @grant: Grant information for a Homa transport.
+ */
+void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa_grant *grant)
+	__must_hold(rpc->bucket->lock)
+{
+	int incoming, delta;
+
+	incoming = rpc->msgin.granted - (rpc->msgin.length -
+					 rpc->msgin.bytes_remaining);
+	if (incoming < 0)
+		incoming = 0;
+	delta = incoming - rpc->msgin.rec_incoming;
+	if (delta != 0)
+		atomic_add(delta, &grant->total_incoming);
+	rpc->msgin.rec_incoming = incoming;
+}
+
+/**
+ * homa_grant_update_granted() - Compute a new grant offset for an RPC.
+ * @rpc: RPC whose msgin.granted should be updated. Must be locked by
+ * caller.
+ * @grant: Information for managing grants. This function may update
+ * its stalled_rank field.
+ * Return: >= 0 means the offset was increased and a grant should be
+ * sent for the RPC; the return value gives the priority to
+ * use in the grant. -1 means the grant offset was not changed
+ * and no grant should be sent.
+ */
+int homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant)
+	__must_hold(rpc->bucket->lock)
+{
+	int received, new_grant_offset, incoming_delta, avl_incoming, rank;
+	int prev_stalled;
+
+	/* Don't increase the grant if the node has been slow to send
+	 * data already granted: no point in wasting grants on this
+	 * node.
+ */ + if (rpc->silent_ticks > 1) + return -1; + rank = READ_ONCE(rpc->msgin.rank); + if (rank < 0 || rpc->msgin.granted >= rpc->msgin.length) + return -1; + + received = rpc->msgin.length - rpc->msgin.bytes_remaining; + new_grant_offset = received + grant->window; + if (new_grant_offset > rpc->msgin.length) + new_grant_offset = rpc->msgin.length; + incoming_delta = new_grant_offset - received - rpc->msgin.rec_incoming; + avl_incoming = grant->max_incoming - atomic_read(&grant->total_incoming); + if (avl_incoming < incoming_delta) { + tt_record4("insufficient headroom for grant for RPC id %d (rank %d): desired increment %d, available %d", + rpc->id, rank, incoming_delta, avl_incoming); + prev_stalled = atomic_read(&grant->stalled_rank); + while (prev_stalled > rank) + prev_stalled = atomic_cmpxchg(&grant->stalled_rank, + prev_stalled, rank); + new_grant_offset -= incoming_delta - avl_incoming; + } + if (new_grant_offset <= rpc->msgin.granted) + return -1; + rpc->msgin.granted = new_grant_offset; + + /* The reason we compute the priority here rather than, say, in + * homa_grant_send is that rpc->msgin.rank could change to -1 + * before homa_grant_send is invoked (it could change at any time, + * since we don't have homa->grant->lock; that's why READ_ONCE + * is used above). It's OK to still send a grant in that case, but + * we need to have a meaningful priority level for it. + */ + return homa_grant_priority(rpc->hsk->homa, rank); +} + +/** + * homa_grant_send() - Issue a GRANT packet for the current grant offset + * of an incoming RPC. + * @rpc: RPC for which to issue GRANT. Should not be locked (to + * minimize lock contention, since sending a packet is slow), + * but caller must hold a reference to keep it from being reaped. + * The msgin.resend_all field will be cleared. + * @priority: Priority level to use for the grant. + */ +void homa_grant_send(struct homa_rpc *rpc, int priority) +{ + struct homa_grant_hdr grant; + + grant.offset = htonl(rpc->msgin.granted); + grant.priority = priority; + tt_record4("sending grant for id %d, offset %d, priority %d, increment %d", + rpc->id, rpc->msgin.granted, grant.priority, + rpc->msgin.granted - rpc->msgin.prev_grant); + rpc->msgin.prev_grant = rpc->msgin.granted; + homa_xmit_control(GRANT, &grant, sizeof(grant), rpc); +} + +/** + * homa_grant_check_rpc() - This function is responsible for generating + * grant packets. Is invoked whenever a data packet arrives for RPC; it + * checks the state of that RPC (as well as other RPCs) and generates + * grant packets as appropriate. + * @rpc: RPC to check. Must be locked by the caller. + */ +void homa_grant_check_rpc(struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) +{ + struct homa_grant *grant = rpc->hsk->homa->grant; + int needy_rank, stalled_rank, rank; + struct homa_grant_candidates cand; + int locked = 0; + u64 now; + int i; + + /* The challenge for this function is to minimize use of the grant + * lock, since that is global. Early versions of Homa acquired the + * grant lock on every call to this function, but that resulted in + * too much contention for the grant lock (especially at network + * speeds of 100 Gbps or more). + * + * This implementation is designed in the hopes that most calls can + * follow a fast path that does not require the grant lock: just + * update grant state for @rpc and possibly issue a new grant for + * @rpc, without considering other RPCs. + * + * However, there are some situations where other RPCs must be + * considered: + * 1. 
If there are higher-priority RPCs that are stalled (they would + * like to issue grants but could not because @total_incoming + * was exceeded), then they must get first shot at any headroom + * that has become available. + * 2. The priority order of RPCs could change, if data packets arrive + * for lower priority RPCs but not for higher priority ones. + * Rather than checking every time data arrives (which would + * require the grant lock), we recheck the priorities at regular + * time intervals. + * 3. Occasionally we need to send grants to the oldest message (FIFO + * priority) in order to prevent starvation. + * + * Each of these situations requires the grant lock. + */ + + if (rpc->msgin.length < 0 || rpc->msgin.num_bpages <= 0 || + rpc->state == RPC_DEAD) + return; + + tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", + rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, + rpc->msgin.length); + INC_METRIC(grant_check_calls, 1); + + needy_rank = INT_MAX; + now = homa_clock(); + homa_grant_update_incoming(rpc, grant); + if (now >= READ_ONCE(grant->next_recalc)) { + /* Situation 2. */ + locked = 1; + tt_record1("homa_grant_check_rpc acquiring grant lock to fix order (id %d)", + rpc->id); + homa_grant_lock(grant); + grant->next_recalc = now + grant->recalc_cycles; + needy_rank = homa_grant_fix_order(grant); + homa_grant_unlock(grant); + tt_record2("homa_grant_check_rpc released grant lock (id %d, needy_rank %d)", + rpc->id, needy_rank); + INC_METRIC(grant_check_recalcs, 1); + } + + rank = READ_ONCE(rpc->msgin.rank); + stalled_rank = atomic_read(&grant->stalled_rank); + if (stalled_rank < needy_rank) + needy_rank = stalled_rank; + + if (rank >= 0 && rank <= needy_rank) { + int priority; + + /* Fast path. */ + priority = homa_grant_update_granted(rpc, grant); + homa_grant_update_incoming(rpc, grant); + if (priority >= 0) { + homa_grant_cand_init(&cand); + if (rpc->msgin.granted >= rpc->msgin.length) + homa_grant_unmanage_rpc(rpc, &cand); + + /* Sending a grant is slow, so release the RPC lock while + * sending the grant to reduce contention. + */ + homa_rpc_unlock(rpc); + homa_grant_send(rpc, priority); + if (!homa_grant_cand_empty(&cand)) + homa_grant_cand_check(&cand, grant); + homa_grant_check_fifo(grant); + homa_rpc_lock(rpc); + } + } + + if (needy_rank < INT_MAX && + atomic_read(&grant->total_incoming) < grant->max_incoming) { + UNIT_HOOK("grant_check_needy"); + /* Situations 1 and 2. 
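+		 *
+		 * First reset stalled_rank atomically so that concurrent
+		 * callers don't all try to service the same stalled RPCs;
+		 * then, under the grant lock, collect every active RPC from
+		 * the needy rank onward that still has headroom in its
+		 * window, so grants can be issued once the lock is dropped.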
*/ + stalled_rank = atomic_xchg(&grant->stalled_rank, INT_MAX); + if (stalled_rank < needy_rank) + needy_rank = stalled_rank; + homa_grant_cand_init(&cand); + locked = 1; + tt_record3("homa_grant_check_rpc acquiring grant lock, needy_rank %d, id %d, num_active %d", + needy_rank, rpc->id, grant->num_active_rpcs); + homa_grant_lock(grant); + for (i = needy_rank; i < grant->num_active_rpcs; i++) { + struct homa_rpc *rpc2 = grant->active_rpcs[i]; + + if (rpc2->msgin.rec_incoming < grant->window && + rpc2->state != RPC_DEAD) + homa_grant_cand_add(&cand, rpc2); + } + homa_grant_unlock(grant); + tt_record1("homa_grant_check_rpc released grant lock (id %d)", + rpc->id); + if (!homa_grant_cand_empty(&cand)) { + homa_rpc_unlock(rpc); + homa_grant_cand_check(&cand, grant); + homa_rpc_lock(rpc); + } + INC_METRIC(grant_check_others, 1); + } + + INC_METRIC(grant_check_locked, locked); + tt_record2("homa_grant_check_rpc finished with id %d, total_incoming %d", + rpc->id, atomic_read(&grant->total_incoming)); +} + +/** + * homa_grant_fix_order() - This function scans all of the RPCS in + * @active_rpcs and repairs any priority inversions that may exist. + * @grant: Overall grant management information. + * Return: The new rank of the highest-priority RPC whose rank improved, + * or INT_MAX if no RPCs were promoted. + */ +int homa_grant_fix_order(struct homa_grant *grant) + __must_hold(grant->lock) +{ + struct homa_rpc *rpc, *other; + int result = INT_MAX; + int i, j; + + for (i = 1; i < grant->num_active_rpcs; i++) { + rpc = grant->active_rpcs[i]; + for (j = i - 1; j >= 0; j--) { + other = grant->active_rpcs[j]; + if (!homa_grant_outranks(rpc, other)) + break; + grant->active_rpcs[j + 1] = other; + other->msgin.rank = j + 1; + grant->active_rpcs[j] = rpc; + rpc->msgin.rank = j; + if (j < result) + result = j; + INC_METRIC(grant_priority_bumps, 1); + } + } + return result; +} + +/** + * homa_grant_find_oldest() - Recompute the value of homa->grant->oldest_rpc. + * @grant: Overall grant management information. @grant->oldest_rpc + * must be NULL. + */ +void homa_grant_find_oldest(struct homa_grant *grant) + __must_hold(grant->lock) +{ + int max_incoming = grant->window + 2 * grant->fifo_grant_increment; + struct homa_rpc *rpc, *oldest; + struct homa_peer *peer; + u64 oldest_birth; + int i; + + oldest = NULL; + oldest_birth = ~0; + + /* Check the grantable lists. */ + list_for_each_entry(peer, &grant->grantable_peers, grantable_links) { + list_for_each_entry(rpc, &peer->grantable_rpcs, + grantable_links) { + if (rpc->msgin.birth >= oldest_birth) + continue; + if (rpc->msgin.rec_incoming >= max_incoming) { + /* This RPC has been granted way more bytes + * than the grant window. This can only + * happen for FIFO grants, and it means the + * peer isn't responding to grants we've sent. + * Pick a different "oldest" RPC. + */ + continue; + } + oldest = rpc; + oldest_birth = rpc->msgin.birth; + } + } + + /* Check the active RPCs (skip the highest priority one, since + * it is already getting lots of grants). 
+ */ + for (i = 1; i < grant->num_active_rpcs; i++) { + rpc = grant->active_rpcs[i]; + if (rpc->msgin.birth >= oldest_birth) + continue; + if (rpc->msgin.rec_incoming >= max_incoming) + continue; + oldest = rpc; + oldest_birth = rpc->msgin.birth; + } + + if (oldest) { + homa_rpc_hold(oldest); + tt_record1("homa_grant_find_oldest chose id %d", oldest->id); + } + grant->oldest_rpc = oldest; +} + +/** + * homa_grant_promote_rpc() - This function is invoked when the grant priority + * of an RPC has increased (e.g., because it received a FIFO grant); it adjusts + * the position of the RPC within the grantable lists and may promote it into + * grant->active_rpcs. This function does not promote within grant->active_rpcs: + * that is handled by homa_grant_fix_order. + * @grant: Overall grant management information. + * @rpc: The RPC to consider for promotion. Must currently be managed for + * grants. + */ +void homa_grant_promote_rpc(struct homa_grant *grant, struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) +{ + struct homa_peer *peer = rpc->peer; + struct homa_rpc *other, *bumped; + + homa_grant_lock(grant); + if (rpc->msgin.rank >= 0) + goto done; + + /* Promote into active_rpcs if appropriate. */ + if (grant->num_active_rpcs < grant->max_overcommit || + homa_grant_outranks(rpc, grant->active_rpcs[grant->num_active_rpcs - + 1])) { + homa_grant_remove_grantable(rpc); + bumped = homa_grant_insert_active(rpc); + if (bumped) + homa_grant_insert_grantable(bumped); + goto done; + } + + /* Promote within the grantable lists. */ + while (rpc != list_first_entry(&peer->grantable_rpcs, + struct homa_rpc, grantable_links)) { + other = list_prev_entry(rpc, grantable_links); + if (!homa_grant_outranks(rpc, other)) + goto done; + list_del(&rpc->grantable_links); + list_add_tail(&rpc->grantable_links, &other->grantable_links); + } + + /* The RPC is now at the head of its peer list, so the peer may need + * to be promoted also. + */ + homa_grant_adjust_peer(grant, peer); + +done: + homa_grant_unlock(grant); +} + +/** + * homa_grant_check_fifo() - Check to see if it is time to make the next + * FIFO grant; if so, make the grant. FIFO grants keep long messages from + * being starved by Homa's SRPT grant mechanism. + * @grant: Overall grant management information. + */ +void homa_grant_check_fifo(struct homa_grant *grant) +{ + struct homa_grant_candidates cand; + struct homa_rpc *rpc; + u64 now; + + /* Note: placing this check before locking saves lock overhead + * in the normal case where it's not yet time for the next FIFO + * grant. This results in a race (2 cores could simultaneously + * decide to make FIFO grants) but that is relatively harmless + * (an occasional extra FIFO grant). + */ + now = homa_clock(); + if (now < grant->fifo_grant_time) + return; + homa_grant_lock(grant); + grant->fifo_grant_time = now + grant->fifo_grant_interval; + if (grant->fifo_fraction == 0 || grant->fifo_grant_increment == 0) { + homa_grant_unlock(grant); + return; + } + + /* See if there is an RPC to grant. */ + rpc = grant->oldest_rpc; + if (rpc) { + /* If the oldest RPC hasn't been responding to FIFO grants + * then switch to a different RPC. 
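+		 * "Not responding" here means the RPC has accumulated
+		 * more outstanding granted-but-unreceived bytes than a
+		 * full window plus two FIFO increments (max_incoming
+		 * below).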
+ */ + int max_incoming = grant->window + 2 * + grant->fifo_grant_increment; + if (rpc->msgin.rec_incoming >= max_incoming) { + grant->oldest_rpc = NULL; + homa_rpc_put(rpc); + rpc = NULL; + } + } + if (!rpc) { + homa_grant_find_oldest(grant); + rpc = grant->oldest_rpc; + if (!rpc) { + homa_grant_unlock(grant); + return; + } + } + + /* Trickiness here: must release the grant lock before acquiring + * the RPC lock. Must acquire a reference on the RPC to keep it + * from being deleted in the gap where no lock is held. + */ + homa_rpc_hold(rpc); + homa_grant_unlock(grant); + homa_rpc_lock(rpc); + if (rpc->state == RPC_DEAD) { + homa_rpc_unlock(rpc); + homa_rpc_put(rpc); + return; + } + homa_grant_cand_init(&cand); + rpc->msgin.granted += grant->fifo_grant_increment; + tt_record2("homa_grant_check_fifo granted %d more bytes to id %d", + grant->fifo_grant_increment, rpc->id); + if (rpc->msgin.granted >= rpc->msgin.length) { + INC_METRIC(fifo_grant_bytes, grant->fifo_grant_increment + + rpc->msgin.length - + rpc->msgin.granted); + rpc->msgin.granted = rpc->msgin.length; + homa_grant_unmanage_rpc(rpc, &cand); + } else { + INC_METRIC(fifo_grant_bytes, grant->fifo_grant_increment); + homa_grant_promote_rpc(grant, rpc); + } + homa_grant_update_incoming(rpc, grant); + homa_rpc_unlock(rpc); + homa_grant_send(rpc, homa_high_priority(grant->homa)); + homa_rpc_put(rpc); + if (!homa_grant_cand_empty(&cand)) + homa_grant_cand_check(&cand, grant); +} + +/** + * homa_grant_cand_add() - Add an RPC into the struct, if there is + * space. After this function is called, homa_grant_cand_check must + * eventually be called to process the entries and release reference + * counts. + * @cand: Structure in which to add @rpc. + * @rpc: RPC to add. If added successfully its reference count will + * be incremented + */ +void homa_grant_cand_add(struct homa_grant_candidates *cand, + struct homa_rpc *rpc) +{ + if (cand->inserts < cand->removes + HOMA_MAX_CAND_RPCS) { + homa_rpc_hold(rpc); + cand->rpcs[cand->inserts & HOMA_CAND_MASK] = rpc; + cand->inserts++; + } +} + +/** + * homa_grant_cand_check() - Scan all of the entries in @cand, issuing + * grants if possible and releasing reference counts. This function + * will acquire each RPCs lock, so the caller must not hold RPC locks + * or locks that conflict with RPC locks, such as the + * grant lock. + * @cand: Check all of the RPCs in this struct. + * @grant: Grant management information. + */ +void homa_grant_cand_check(struct homa_grant_candidates *cand, + struct homa_grant *grant) +{ + struct homa_rpc *rpc; + int priority; + bool locked; + + while (cand->removes < cand->inserts) { + rpc = cand->rpcs[cand->removes & HOMA_CAND_MASK]; + cand->removes++; + homa_rpc_lock(rpc); + locked = true; + + if (rpc->state != RPC_DEAD) { + priority = homa_grant_update_granted(rpc, grant); + if (priority >= 0) { + homa_grant_update_incoming(rpc, grant); + if (rpc->msgin.granted >= rpc->msgin.length) + homa_grant_unmanage_rpc(rpc, cand); + homa_rpc_unlock(rpc); + locked = false; + homa_grant_send(rpc, priority); + } + } + if (locked) + homa_rpc_unlock(rpc); + homa_rpc_put(rpc); + } +} + +/** + * homa_grant_lock_slow() - This function implements the slow path for + * acquiring the grant lock. It is invoked when the lock isn't immediately + * available. It waits for the lock, but also records statistics about + * the waiting time. + * @grant: Grant management information. 
+ */
+void homa_grant_lock_slow(struct homa_grant *grant)
+	__acquires(grant->lock)
+{
+	u64 start = homa_clock();
+
+	tt_record("beginning wait for grant lock");
+	spin_lock_bh(&grant->lock);
+	tt_record("ending wait for grant lock");
+	INC_METRIC(grant_lock_misses, 1);
+	INC_METRIC(grant_lock_miss_cycles, homa_clock() - start);
+}
+
+/**
+ * homa_grant_update_sysctl_deps() - Invoked whenever a sysctl value is changed;
+ * updates variables that depend on sysctl-settable values.
+ * @grant: Structure in which to update information.
+ */
+void homa_grant_update_sysctl_deps(struct homa_grant *grant)
+{
+	u64 fifo_mbps, clocks_per_fifo_mbit, interval;
+
+	if (grant->max_overcommit > HOMA_MAX_GRANTS)
+		grant->max_overcommit = HOMA_MAX_GRANTS;
+
+	if (grant->fifo_fraction > 500)
+		grant->fifo_fraction = 500;
+	fifo_mbps = (u64)grant->homa->link_mbps * grant->fifo_fraction;
+	do_div(fifo_mbps, 1000);
+	if (fifo_mbps > 0 && grant->fifo_grant_increment > 0) {
+		clocks_per_fifo_mbit = 1000 * homa_clock_khz();
+		do_div(clocks_per_fifo_mbit, fifo_mbps);
+		interval = clocks_per_fifo_mbit * grant->fifo_grant_increment *
+			   8;
+		do_div(interval, 1000000);
+		grant->fifo_grant_interval = interval;
+	} else {
+		grant->fifo_grant_interval = 1000 * homa_clock_khz();
+	}
+
+	grant->recalc_cycles = homa_usecs_to_cycles(grant->recalc_usecs);
+
+	grant->window = homa_grant_window(grant);
+}
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_grant_dointvec() - This function is a wrapper around proc_dointvec. It
+ * is invoked to read and write grant-related sysctl values.
+ * @table: sysctl table describing value to be read or written.
+ * @write: Nonzero means value is being written, 0 means read.
+ * @buffer: Address in user space of the input/output data.
+ * @lenp: Pointer to the number of bytes of data in @buffer; updated to
+ * reflect the number of bytes actually processed.
+ * @ppos: Pointer to the current file offset; updated by this call.
+ *
+ * Return: 0 for success, nonzero for error.
+ */
+int homa_grant_dointvec(const struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table table_copy;
+	struct homa_grant *grant;
+	int result;
+
+	grant = homa_net(current->nsproxy->net_ns)->homa->grant;
+
+	/* Generate a new ctl_table that refers to a field in the
+	 * grant object for the current network namespace.
+	 */
+	table_copy = *table;
+	table_copy.data = ((char *)grant) + (uintptr_t)table_copy.data;
+
+	result = proc_dointvec(&table_copy, write, buffer, lenp, ppos);
+	if (write)
+		homa_grant_update_sysctl_deps(grant);
+	return result;
+}
+#endif /* See strip.py */
diff --git a/homa_grant.h b/homa_grant.h
new file mode 100644
index 00000000..3fa7a8f8
--- /dev/null
+++ b/homa_grant.h
@@ -0,0 +1,315 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file contains definitions related to generating grants. */
+
+#ifndef _HOMA_GRANT_H
+#define _HOMA_GRANT_H
+
+#include "homa_rpc.h"
+
+/**
+ * define HOMA_MAX_GRANTS - Used to size various data structures for grant
+ * management; the max_overcommit sysctl parameter must never be greater than
+ * this.
+ */
+#define HOMA_MAX_GRANTS 10
+
+/**
+ * struct homa_grant - Holds information used to manage the sending of
+ * grants for incoming messages. There is one instance of this object
+ * stored in each struct homa.
+ */
+struct homa_grant {
+	/** @homa: The struct homa that this object belongs to.
+	 */
+	struct homa *homa;
+
+	/**
+	 * @total_incoming: the total number of bytes that we expect to receive
+	 * (across all messages) even if we don't send out any more grants
+	 * (includes granted but unreceived bytes, plus unreceived unscheduled
+	 * bytes that we know about). This can potentially be negative, if
+	 * a peer sends more bytes than granted (see synchronization note in
+	 * homa_grant_check_rpc for why we have to allow this possibility).
+	 */
+	atomic_t total_incoming;
+
+	/**
+	 * @stalled_rank: rank of the highest-priority RPC (i.e., lowest
+	 * rank) whose incoming message could not be fully granted because
+	 * @total_incoming exceeded @max_incoming. INT_MAX means there are
+	 * no stalled RPCs.
+	 */
+	atomic_t stalled_rank;
+
+	/**
+	 * @max_incoming: Homa will try to ensure that the total number of
+	 * bytes senders have permission to send to this host (either
+	 * unscheduled bytes or granted bytes) does not exceed this value.
+	 * Set externally via sysctl.
+	 */
+	int max_incoming;
+
+	/**
+	 * @lock: The grant lock: used to synchronize access to grant-related
+	 * fields below as well as some fields in homa_rpc structs.
+	 */
+	spinlock_t lock ____cacheline_aligned_in_smp;
+
+	/**
+	 * @lock_time: homa_clock() time when lock was last locked. Used
+	 * for computing statistics.
+	 */
+	u64 lock_time;
+
+	/**
+	 * @num_active_rpcs: Number of entries in @active_rpcs that are
+	 * currently used.
+	 */
+	int num_active_rpcs;
+
+	/**
+	 * @active_rpcs: The highest-priority RPCs that still need grants.
+	 * Lower index in the list means higher priority. If an RPC is in
+	 * this array then it is not in @grantable_peers.
+	 */
+	struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS];
+
+	/**
+	 * @grantable_peers: Contains all peers with entries in their
+	 * grantable_rpcs lists. The list is sorted in priority order of
+	 * the highest priority RPC for each peer (fewer ungranted bytes ->
+	 * higher priority).
+	 */
+	struct list_head grantable_peers;
+
+	/**
+	 * @num_grantable_rpcs: Total number of RPCs with incoming
+	 * messages that still need grants. Includes entries in both
+	 * @active_rpcs and @grantable_peers.
+	 */
+	int num_grantable_rpcs;
+
+	/**
+	 * @last_grantable_change: The homa_clock() time of the most recent
+	 * increment or decrement of num_grantable_rpcs; used for computing
+	 * statistics.
+	 */
+	u64 last_grantable_change;
+
+	/**
+	 * @max_grantable_rpcs: The largest value that has been seen for
+	 * num_grantable_rpcs since this value was reset to 0 (it can be
+	 * reset externally using sysctl).
+	 */
+	int max_grantable_rpcs;
+
+	/**
+	 * @window: Maximum number of granted but not yet received bytes for
+	 * an incoming message. Computed from @window_param.
+	 */
+	int window;
+
+	/**
+	 * @window_param: Set externally via sysctl to select a policy for
+	 * computing grant windows (how much granted but not yet received
+	 * data an incoming message may have). If nonzero, then it specifies
+	 * a (static) size for windows. 0 means compute windows dynamically
+	 * based on the number of RPCs we're currently granting to.
+	 */
+	int window_param;
+
+	/**
+	 * @max_rpcs_per_peer: If there are multiple incoming messages from
+	 * the same peer, Homa will only issue grants to this many of them
+	 * at a time. Set externally via sysctl.
+	 */
+	int max_rpcs_per_peer;
+
+	/**
+	 * @max_overcommit: The maximum number of messages to which Homa will
+	 * send grants at any given point in time. Set externally via sysctl.
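+	 * Granting to multiple messages at once ("overcommitment")
+	 * costs buffer space, but it keeps the downlink busy when some
+	 * senders are slow to transmit the bytes they have been granted.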
+	 */
+	int max_overcommit;
+
+	/**
+	 * @recalc_usecs: Length of the priority recalculation interval, in
+	 * microseconds. Each interval, priorities of the active messages
+	 * get resorted if they have drifted out of order. Set externally
+	 * via sysctl.
+	 */
+	int recalc_usecs;
+
+	/**
+	 * @recalc_cycles: Same as @recalc_usecs except in homa_clock() units.
+	 */
+	int recalc_cycles;
+
+	/**
+	 * @next_recalc: Time in homa_clock() units at which priorities
+	 * should be recalculated.
+	 */
+	u64 next_recalc;
+
+	/**
+	 * @fifo_grant_increment: how many additional bytes to grant in
+	 * a "pity" grant sent to the oldest outstanding message. Set
+	 * externally via sysctl.
+	 */
+	int fifo_grant_increment;
+
+	/**
+	 * @fifo_fraction: The fraction (in thousandths) of granted
+	 * bytes that should go to the *oldest* incoming message, rather
+	 * than the highest priority ones. Set externally via sysctl.
+	 */
+	int fifo_fraction;
+
+	/**
+	 * @fifo_grant_interval: The time (in homa_clock units) between
+	 * successive FIFO grants.
+	 */
+	u64 fifo_grant_interval;
+
+	/**
+	 * @fifo_grant_time: The time when we should issue the next FIFO
+	 * grant.
+	 */
+	u64 fifo_grant_time;
+
+	/**
+	 * @oldest_rpc: The RPC with incoming data whose msgin.birth is
+	 * farthest in the past. NULL means either there are no incoming
+	 * RPCs or the oldest needs to be recomputed. There is always a
+	 * reference taken for this RPC. Must hold the grant lock to update.
+	 */
+	struct homa_rpc *oldest_rpc;
+
+#ifndef __STRIP__ /* See strip.py */
+	/**
+	 * @sysctl_header: Used to remove sysctl values when this structure
+	 * is destroyed.
+	 */
+	struct ctl_table_header *sysctl_header;
+#endif /* See strip.py */
+} ____cacheline_aligned_in_smp;
+
+/**
+ * struct homa_grant_candidates - Accumulates information about RPCs that
+ * can potentially be issued grants. Used in order to defer the actual
+ * granting until it is safe to acquire locks for the RPCs.
+ */
+struct homa_grant_candidates {
+	/**
+	 * @inserts: Total number of RPCs that have been inserted in this
+	 * structure over its lifetime. Low-order bits indicate where the
+	 * next RPC should be inserted.
+	 */
+	u32 inserts;
+
+	/**
+	 * @removes: Total number of RPCs that have been removed from this
+	 * structure over its lifetime. Low-order bits give index of next
+	 * RPC to be checked for possible grant.
+	 */
+	u32 removes;
+
+	/* Maximum number of RPCs that can be stored in @rpcs. If space
+	 * runs out some potentially grant-worthy RPCs may be ignored,
+	 * but they will get another chance in a future call to
+	 * homa_grant_check_rpc. Must be a power of 2.
+	 */
+#define HOMA_MAX_CAND_RPCS 8
+#define HOMA_CAND_MASK (HOMA_MAX_CAND_RPCS - 1)
+
+	/** @rpcs: RPCs that should be considered for sending grants.
*/ + struct homa_rpc *rpcs[HOMA_MAX_CAND_RPCS]; + +}; + +struct homa_grant + *homa_grant_alloc(struct homa *homa); +void homa_grant_adjust_peer(struct homa_grant *grant, + struct homa_peer *peer); +void homa_grant_cand_add(struct homa_grant_candidates *cand, + struct homa_rpc *rpc); +void homa_grant_cand_check(struct homa_grant_candidates *cand, + struct homa_grant *grant); +void homa_grant_check_fifo(struct homa_grant *grant); +void homa_grant_check_rpc(struct homa_rpc *rpc); +int homa_grant_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +void homa_grant_end_rpc(struct homa_rpc *rpc); +void homa_grant_find_oldest(struct homa_grant *grant); +int homa_grant_fix_order(struct homa_grant *grant); +void homa_grant_free(struct homa_grant *grant); +void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched); +struct homa_rpc + *homa_grant_insert_active(struct homa_rpc *rpc); +void homa_grant_insert_grantable(struct homa_rpc *rpc); +void homa_grant_manage_rpc(struct homa_rpc *rpc); +void homa_grant_lock_slow(struct homa_grant *grant); +void homa_grant_log_tt(struct homa *homa); +int homa_grant_outranks(struct homa_rpc *rpc1, + struct homa_rpc *rpc2); +void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +int homa_grant_priority(struct homa *homa, int rank); +void homa_grant_promote_rpc(struct homa_grant *grant, struct homa_rpc *rpc); +void homa_grant_remove_active(struct homa_rpc *rpc, + struct homa_grant_candidates *cand); +void homa_grant_remove_grantable(struct homa_rpc *rpc); +void homa_grant_send(struct homa_rpc *rpc, int priority); +void homa_grant_unmanage_rpc(struct homa_rpc *rpc, + struct homa_grant_candidates *cand); +int homa_grant_update_granted(struct homa_rpc *rpc, + struct homa_grant *grant); +void homa_grant_update_incoming(struct homa_rpc *rpc, + struct homa_grant *grant); +void homa_grant_update_sysctl_deps(struct homa_grant *grant); +int homa_grant_window(struct homa_grant *grant); + +/** + * homa_grant_cand_init() - Reset @cand to an empty state. + * @cand: Structure to initialize. + */ +static inline void homa_grant_cand_init(struct homa_grant_candidates *cand) +{ + cand->inserts = 0; + cand->removes = 0; +} + +/** + * homa_grant_cand_empty() - Returns true if there are no RPCs in @cand, + * false otherwise + * @cand: Structure to check. + * Return: See above. + */ +static inline bool homa_grant_cand_empty(struct homa_grant_candidates *cand) +{ + return cand->inserts == cand->removes; +} + +/** + * homa_grant_lock() - Acquire the grant lock. If the lock + * isn't immediately available, record stats on the waiting time. + * @grant: Grant management info. + */ +static inline void homa_grant_lock(struct homa_grant *grant) + __acquires(grant->lock) +{ + if (!spin_trylock_bh(&grant->lock)) + homa_grant_lock_slow(grant); + grant->lock_time = homa_clock(); +} + +/** + * homa_grant_unlock() - Release the grant lock. + * @grant: Grant management info. 
+ */
+static inline void homa_grant_unlock(struct homa_grant *grant)
+	__releases(grant->lock)
+{
+	INC_METRIC(grant_lock_cycles, homa_clock() - grant->lock_time);
+	spin_unlock_bh(&grant->lock);
+}
+
+#endif /* _HOMA_GRANT_H */
diff --git a/homa_impl.h b/homa_impl.h
index 273ee9d4..61c4c912 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -1,17 +1,4 @@
-/* Copyright (c) 2019-2023 Stanford University
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
 
 /* This file contains definitions that are shared across the files
  * that implement Homa for Linux.
@@ -20,13 +7,10 @@
 #ifndef _HOMA_IMPL_H
 #define _HOMA_IMPL_H
 
-#pragma GCC diagnostic ignored "-Wpointer-sign"
-#pragma GCC diagnostic ignored "-Wunused-variable"
-
 #include
 #ifdef __UNIT_TEST__
 #undef WARN
-#define WARN(condition, format...)
+#define WARN(...)
 
 #undef WARN_ON
 #define WARN_ON(condition) ({ \
@@ -36,7 +20,7 @@
 #undef WARN_ON_ONCE
 #define WARN_ON_ONCE(condition) WARN_ON(condition)
-#endif
+#endif /* __UNIT_TEST__ */
 
 #include
 #include
@@ -49,2570 +33,504 @@
 #include
 #include
 #include
-#include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
-#pragma GCC diagnostic warning "-Wpointer-sign"
-#pragma GCC diagnostic warning "-Wunused-variable"
+#include
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0)
-typedef unsigned int __poll_t;
-#endif
+#ifndef __UPSTREAM__ /* See strip.py */
+#include "homa.h"
+#include
+#include "homa_devel.h"
+#else /* See strip.py */
+#include
+#endif /* See strip.py */
+#include "homa_wire.h"
 
 #ifdef __UNIT_TEST__
-#define spin_unlock mock_spin_unlock
-extern void mock_spin_unlock(spinlock_t *lock);
-
-#undef get_cycles
-#define get_cycles mock_get_cycles
-extern cycles_t mock_get_cycles(void);
-
-#define signal_pending(xxx) mock_signal_pending
-extern int mock_signal_pending;
-
-#define rcu_read_lock mock_rcu_read_lock
-extern void mock_rcu_read_lock(void);
-
-#define rcu_read_unlock mock_rcu_read_unlock
-extern void mock_rcu_read_unlock(void);
-
-#undef current
-#define current current_task
-
-#define kthread_complete_and_exit(comp, code)
-
-#define kmalloc mock_kmalloc
-extern void *mock_kmalloc(size_t size, gfp_t flags);
+#include "mock.h"
+#endif /* __UNIT_TEST__ */
+
+#ifndef __STRIP__ /* See strip.py */
+/* Null out things that confuse VSCode Intellisense */
+#ifdef __VSCODE__
+#define smp_processor_id() 1
+#define BUG()
+#define BUG_ON(...)
+#define set_current_state(...)
 #endif
-
-#include "homa.h"
-#include "timetrace.h"
+#endif /* See strip.py */
 
 /* Forward declarations. */
-struct homa_sock;
-struct homa_rpc;
 struct homa;
 struct homa_peer;
-struct homa_lcache;
-
-/* Declarations used in this file, so they can't be made at the end.
*/ -extern void homa_grantable_lock_slow(struct homa *homa); -extern void homa_peer_lock_slow(struct homa_peer *peer); -extern void homa_rpc_lock_slow(struct homa_rpc *rpc); -extern void homa_sock_lock_slow(struct homa_sock *hsk); -extern void homa_throttle_lock_slow(struct homa *homa); - -/** - * enum homa_packet_type - Defines the possible types of Homa packets. - * - * See the xxx_header structs below for more information about each type. - */ -enum homa_packet_type { - DATA = 0x10, - GRANT = 0x11, - RESEND = 0x12, - UNKNOWN = 0x13, - BUSY = 0x14, - CUTOFFS = 0x15, - FREEZE = 0x16, - NEED_ACK = 0x17, - ACK = 0x18, - BOGUS = 0x19, /* Used only in unit tests. */ - /* If you add a new type here, you must also do the following: - * 1. Change BOGUS so it is the highest opcode - * 2. Add support for the new opcode in homa_print_packet, - * homa_print_packet_short, homa_symbol_for_type, and mock_skb_new. - * 3. Add the header length to header_lengths in homa_plumbing.c. - */ -}; - -/** define HOMA_IPV6_HEADER_LENGTH - Size of IP header (V6). */ -#define HOMA_IPV6_HEADER_LENGTH 40 - -/** define HOMA_IPV4_HEADER_LENGTH - Size of IP header (V4). */ -#define HOMA_IPV4_HEADER_LENGTH 20 - -/** - * define HOMA_SKB_EXTRA - How many bytes of additional space to allow at the - * beginning of each sk_buff, before the IP header. This includes room for a - * VLAN header and also includes some extra space, "just to be safe" (not - * really sure if this is needed). - */ -#define HOMA_SKB_EXTRA 40 - -/** - * define HOMA_ETH_OVERHEAD - Number of bytes per Ethernet packet for CRC, - * preamble, and inter-packet gap. - */ -#define HOMA_ETH_OVERHEAD 24 - -/** - * define HOMA_MIN_PKT_LENGTH - Every Homa packet must be padded to at least - * this length to meet Ethernet frame size limitations. This number includes - * Homa headers and data, but not IP or Ethernet headers. - */ -#define HOMA_MIN_PKT_LENGTH 26 +struct homa_rpc; +struct homa_sock; -/** - * define HOMA_MAX_HEADER - Number of bytes in the largest Homa header. - */ -#define HOMA_MAX_HEADER 90 +#ifndef __STRIP__ /* See strip.py */ +#include "timetrace.h" +#include "homa_metrics.h" -/** - * define ETHERNET_MAX_PAYLOAD - Maximum length of an Ethernet packet, - * excluding preamble, frame delimeter, VLAN header, CRC, and interpacket gap; - * i.e. all of this space is available for Homa. - */ -#define ETHERNET_MAX_PAYLOAD 1500 +/* Declarations used in this file, so they can't be made at the end. */ +void homa_throttle_lock_slow(struct homa *homa); +#endif /* See strip.py */ /** - * define HOMA_MAX_PRIORITIES - The maximum number of priority levels that - * Homa can use (the actual number can be restricted to less than this at - * runtime). Changing this value will affect packet formats. + * union sockaddr_in_union - Holds either an IPv4 or IPv6 address (smaller + * and easier to use than sockaddr_storage). */ -#define HOMA_MAX_PRIORITIES 8 +union sockaddr_in_union { + /** @sa: Used to access as a generic sockaddr. */ + struct sockaddr sa; -#define sizeof32(type) ((int) (sizeof(type))) + /** @in4: Used to access as IPv4 socket. */ + struct sockaddr_in in4; -/** define CACHE_LINE_SIZE - The number of bytes in a cache line. */ -#define CACHE_LINE_SIZE 64 - -/** - * define NUM_PEER_UNACKED_IDS - The number of ids for unacked RPCs that - * can be stored in a struct homa_peer. - */ -#define NUM_PEER_UNACKED_IDS 5 - -/** - * struct homa_cache_line - An object whose size equals that of a cache line. 
- */ -struct homa_cache_line { - char bytes[64]; + /** @in6: Used to access as IPv6 socket. */ + struct sockaddr_in6 in6; }; /** - * struct common_header - Wire format for the first bytes in every Homa - * packet. This must partially match the format of a TCP header so that - * Homa can piggyback on TCP segmentation offload (and possibly other - * features, such as RSS). + * struct homa - Stores overall information about the Homa transport, which + * is shared across all Homa sockets and all network namespaces. */ -struct common_header { +struct homa { /** - * @sport: Port on source machine from which packet was sent. - * Must be in the same position as in a TCP header. + * @next_outgoing_id: Id to use for next outgoing RPC request. + * This is always even: it's used only to generate client-side ids. + * Accessed without locks. Note: RPC ids are unique within a + * single client machine. */ - __be16 sport; + atomic64_t next_outgoing_id; - /** - * @dport: Port on destination that is to receive packet. Must be - * in the same position as in a TCP header. - */ - __be16 dport; +#ifndef __UPSTREAM__ /* See strip.py */ + /** @qshared: Contains information used by homa_qdisc.c. */ + struct homa_qdisc_shared *qshared; +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ /** - * @unused1: corresponds to the sequence number field in TCP headers; - * must not be used by Homa, in case it gets incremented during TCP - * offload. + * @pacer: Information related to the pacer; managed by homa_pacer.c. */ - __be32 unused1; - - __be32 unused2; + struct homa_pacer *pacer; /** - * @doff: High order 4 bits holds the number of 4-byte chunks in a - * data_header (low-order bits unused). Used only for DATA packets; - * must be in the same position as the data offset in a TCP header. + * @grant: Contains information used by homa_grant.c to manage + * grants for incoming messages. */ - __u8 doff; - - /** @type: One of the values of &enum packet_type. */ - __u8 type; - - __u16 unused3; - - /** - * @checksum: not used by Homa, but must occupy the same bytes as - * the checksum in a TCP header (TSO may modify this?).*/ - __be16 checksum; - - __u16 unused4; + struct homa_grant *grant; +#endif /* See strip.py */ /** - * @sender_id: the identifier of this RPC as used on the sender (i.e., - * if the low-order bit is set, then the sender is the server for - * this RPC). + * @peertab: Info about all the other hosts we have communicated with; + * includes peers from all network namespaces. */ - __be64 sender_id; -} __attribute__((packed)); + struct homa_peertab *peertab; -/** - * struct homa_ack - Identifies an RPC that can be safely deleted by its - * server. After sending the response for an RPC, the server must retain its - * state for the RPC until it knows that the client has successfully - * received the entire response. An ack indicates this. Clients will - * piggyback acks on future data packets, but if a client doesn't send - * any data to the server, the server will eventually request an ack - * explicitly with a NEED_ACK packet, in which case the client will - * return an explicit ACK. - */ -struct homa_ack { /** - * @id: The client's identifier for the RPC. 0 means this ack - * is invalid. + * @socktab: Information about all open sockets. Dynamically + * allocated; must be kfreed. */ - __be64 client_id; - - /** @client_port: The client-side port for the RPC. */ - __be16 client_port; - - /** @server_port: The server-side port for the RPC. 
*/ - __be16 server_port; -} __attribute__((packed)); + struct homa_socktab *socktab; -/** - * struct data_segment - Wire format for a chunk of data that is part of - * a DATA packet. A single sk_buff can hold multiple data_segments in order - * to enable send and receive offload (the idea is to carry many network - * packets of info in a single traversal of the Linux networking stack). - * A DATA sk_buff contains a data_header followed by any number of - * data_segments. - */ -struct data_segment { +#ifndef __STRIP__ /* See strip.py */ /** - * @offset: Offset within message of the first byte of data in - * this segment. Segments within an sk_buff are not guaranteed - * to be in order. + * @page_pool_mutex: Synchronizes access to any/all of the page_pools + * used for outgoing sk_buff data. */ - __be32 offset; - - /** @segment_length: Number of bytes of data in this segment. */ - __be32 segment_length; + spinlock_t page_pool_mutex ____cacheline_aligned_in_smp; - /** @ack: If the @client_id field is nonzero, provides info about - * an RPC that the recipient can now safely free. + /** + * @page_pools: One page pool for each NUMA node on the machine. + * If there are no cores for node, then this value is NULL. */ - struct homa_ack ack; - - /** @data: the payload of this segment. */ - char data[0]; -} __attribute__((packed)); + struct homa_page_pool *page_pools[MAX_NUMNODES]; +#endif /* See strip.py */ -/* struct data_header - Overall header format for a DATA sk_buff, which - * contains this header followed by any number of data_segments. - */ -struct data_header { - struct common_header common; - - /** @message_length: Total #bytes in the *message* */ - __be32 message_length; + /** @max_numa: Highest NUMA node id in use by any core. */ + int max_numa; +#ifndef __STRIP__ /* See strip.py */ /** - * @incoming: The receiver can expect the sender to send all of the - * bytes in the message up to at least this offset (exclusive), - * even without additional grants. This includes unscheduled - * bytes, granted bytes, plus any additional bytes the sender - * transmits unilaterally (e.g., to send batches, such as with GSO). + * @skb_page_frees_per_sec: Rate at which to return pages from sk_buff + * page pools back to Linux. This is the total rate across all pools. + * Set externally via sysctl. */ - __be32 incoming; + int skb_page_frees_per_sec; /** - * @cutoff_version: The cutoff_version from the most recent - * CUTOFFS packet that the source of this packet has received - * from the destination of this packet, or 0 if the source hasn't - * yet received a CUTOFFS packet. + * @skb_pages_to_free: Space in which to collect pages that are + * about to be released. Dynamically allocated. */ - __be16 cutoff_version; + struct page **skb_pages_to_free; /** - * @retransmit: 1 means this packet was sent in response to a RESEND - * (it has already been sent previously). 
+ * @pages_to_free_slots: Maximum number of pages that can be + * stored in skb_pages_to_free; */ - __u8 retransmit; - - __u8 pad; - - /** @seg: First of possibly many segments */ - struct data_segment seg; -} __attribute__((packed)); -_Static_assert(sizeof(struct data_header) <= HOMA_MAX_HEADER, - "data_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); -_Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH, - "data_header too small: Homa doesn't currently have code" - "to pad data packets"); -_Static_assert(((sizeof(struct data_header) - sizeof(struct data_segment)) - & 0x3) == 0, - " data_header length not a multiple of 4 bytes (required " - "for TCP/TSO compatibility"); - -/** - * struct grant_header - Wire format for GRANT packets, which are sent by - * the receiver back to the sender to indicate that the sender may transmit - * additional bytes in the message. - */ -struct grant_header { - /** @common: Fields common to all packet types. */ - struct common_header common; + int pages_to_free_slots; /** - * @offset: Byte offset within the message. - * - * The sender should now transmit all data up to (but not including) - * this offset ASAP, if it hasn't already. + * @skb_page_free_time: homa_clock() time when the next sk_buff + * page should be freed. Could be in the past. */ - __be32 offset; + u64 skb_page_free_time; /** - * @priority: The sender should use this priority level for all future - * MESSAGE_FRAG packets for this message, until a GRANT is received - * with higher offset. Larger numbers indicate higher priorities. + * @skb_page_pool_min_kb: Don't return pages from a pool to Linux + * if the amount of unused space in the pool has been less than this + * many KBytes at any time in the recent past. Set externally via + * sysctl. */ - __u8 priority; -} __attribute__((packed)); -_Static_assert(sizeof(struct grant_header) <= HOMA_MAX_HEADER, - "grant_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); - -/** - * struct resend_header - Wire format for RESEND packets. - * - * A RESEND is sent by the receiver when it believes that message data may - * have been lost in transmission (or if it is concerned that the sender may - * have crashed). The receiver should resend the specified portion of the - * message, even if it already sent it previously. - */ -struct resend_header { - /** @common: Fields common to all packet types. */ - struct common_header common; + int skb_page_pool_min_kb; /** - * @offset: Offset within the message of the first byte of data that - * should be retransmitted. + * @unsched_bytes: The number of bytes that may be sent in a + * new message without receiving any grants. There used to be a + * variable rtt_bytes that served this purpose, and was also used + * for window. Historically, rtt_bytes was intended to be the amount + * of data that can be transmitted over the wire in the time it + * takes to send a full-size data packet and receive back a grant. + * But, for fast networks that value could result in too much + * buffer utilization (and, we wanted to have separate values for + * @unsched_bytes and @window). Set externally via sysctl. */ - __be32 offset; + int unsched_bytes; +#endif /* See strip.py */ /** - * @length: Number of bytes of data to retransmit; this could specify - * a range longer than the total message size. 
Zero is a special case - * used by servers; in this case, there is no need to actually resend - * anything; the purpose of this packet is to trigger an UNKNOWN - * response if the client no longer cares about this RPC. + * @link_mbps: The raw bandwidth of the network uplink, in + * units of 1e06 bits per second. Set externally via sysctl. */ - __be32 length; + int link_mbps; +#ifndef __STRIP__ /* See strip.py */ /** - * @priority: Packet priority to use. - * - * The sender should transmit all the requested data using this - * priority. + * @poll_usecs: Amount of time (in microseconds) that a thread + * will spend busy-waiting for an incoming messages before + * going to sleep. Set externally via sysctl. */ - __u8 priority; -} __attribute__((packed)); -_Static_assert(sizeof(struct resend_header) <= HOMA_MAX_HEADER, - "resend_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); - -/** - * struct unknown_header - Wire format for UNKNOWN packets. - * - * An UNKNOWN packet is sent by either server or client when it receives a - * packet for an RPC that is unknown to it. When a client receives an - * UNKNOWN packet it will typically restart the RPC from the beginning; - * when a server receives an UNKNOWN packet it will typically discard its - * state for the RPC. - */ -struct unknown_header { - /** @common: Fields common to all packet types. */ - struct common_header common; -} __attribute__((packed)); -_Static_assert(sizeof(struct unknown_header) <= HOMA_MAX_HEADER, - "unknown_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); - -/** - * struct busy_header - Wire format for BUSY packets. - * - * These packets tell the recipient that the sender is still alive (even if - * it isn't sending data expected by the recipient). - */ -struct busy_header { - /** @common: Fields common to all packet types. */ - struct common_header common; -} __attribute__((packed)); -_Static_assert(sizeof(struct busy_header) <= HOMA_MAX_HEADER, - "busy_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + int poll_usecs; -/** - * struct cutoffs_header - Wire format for CUTOFFS packets. - * - * These packets tell the recipient how to assign priorities to - * unscheduled packets. - */ -struct cutoffs_header { - /** @common: Fields common to all packet types. */ - struct common_header common; + /** @poll_cycles: Same as poll_usecs except in homa_clock() units. */ + u64 poll_cycles; /** - * @unsched_cutoffs: priorities to use for unscheduled packets - * sent to the sender of this packet. See documentation for - * @homa.unsched_cutoffs for the meanings of these values. + * @num_priorities: The total number of priority levels available for + * Homa's use. Internally, Homa will use priorities from 0 to + * num_priorities-1, inclusive. Set externally via sysctl. */ - __be32 unsched_cutoffs[HOMA_MAX_PRIORITIES]; + int num_priorities; /** - * @cutoff_version: unique identifier associated with @unsched_cutoffs. - * Must be included in future DATA packets sent to the sender of - * this packet. + * @priority_map: entry i gives the value to store in the high-order + * 3 bits of the DSCP field of IP headers to implement priority level + * i. Set externally via sysctl. */ - __be16 cutoff_version; -} __attribute__((packed)); -_Static_assert(sizeof(struct cutoffs_header) <= HOMA_MAX_HEADER, - "cutoffs_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); - -/** - * struct freeze_header - Wire format for FREEZE packets. 
- *
- * These packets tell the recipient to freeze its timetrace; used
- * for debugging.
- */
-struct freeze_header {
- /** @common: Fields common to all packet types. */
- struct common_header common;
-} __attribute__((packed));
-_Static_assert(sizeof(struct freeze_header) <= HOMA_MAX_HEADER,
- "freeze_header too large for HOMA_MAX_HEADER; must "
- "adjust HOMA_MAX_HEADER");
-
-/**
- * struct need_ack_header - Wire format for NEED_ACK packets.
- *
- * These packets ask the recipient (a client) to return an ACK message if
- * the packet's RPC is no longer active.
- */
-struct need_ack_header {
- /** @common: Fields common to all packet types. */
- struct common_header common;
-} __attribute__((packed));
-_Static_assert(sizeof(struct need_ack_header) <= HOMA_MAX_HEADER,
- "need_ack_header too large for HOMA_MAX_HEADER; must "
- "adjust HOMA_MAX_HEADER");
-
-/**
- * struct ack_header - Wire format for ACK packets.
- *
- * These packets are sent from a client to a server to indicate that
- * a set of RPCs is no longer active on the client, so the server can
- * free any state it may have for them.
- */
-struct ack_header {
- /** @common: Fields common to all packet types. */
- struct common_header common;
-
- /** @num_acks: number of (leading) elements in @acks that are valid. */
- __be16 num_acks;
-
- struct homa_ack acks[NUM_PEER_UNACKED_IDS];
-} __attribute__((packed));
-_Static_assert(sizeof(struct ack_header) <= HOMA_MAX_HEADER,
- "ack_header too large for HOMA_MAX_HEADER; must "
- "adjust HOMA_MAX_HEADER");
+ int priority_map[HOMA_MAX_PRIORITIES];

-/**
- * struct homa_message_out - Describes a message (either request or response)
- * for which this machine is the sender.
- */
-struct homa_message_out {
 /**
- * @length: Total bytes in message (excluding headers). A value
- * less than 0 means this structure is uninitialized and therefore
- * not in use (all other fields will be zero in this case).
+ * @max_sched_prio: The highest priority level currently available for
+ * scheduled packets. Levels above this are reserved for unscheduled
+ * packets. Set externally via sysctl.
 */
- int length;
-
- /** @num_skbs: Total number of buffers currently in @packets. */
- int num_skbs;
+ int max_sched_prio;

 /**
- * @packets: Singly-linked list of all packets in message, linked
- * using homa_next_skb. The list is in order of offset in the message
- * (offset 0 first); each sk_buff can potentially contain multiple
- * data_segments, which will be split into separate packets by GSO.
- * This list grows gradually as data is copied in from user space,
- * so it may not be complete.
+ * @unsched_cutoffs: the current priority assignments for incoming
+ * unscheduled packets. The value of entry i is the largest
+ * message size that uses priority i (larger i is higher priority).
+ * If entry i has a value of HOMA_MAX_MESSAGE_SIZE or greater, then
+ * priority levels less than i will not be used for unscheduled
+ * packets. At least one entry in the array must have a value of
+ * HOMA_MAX_MESSAGE_SIZE or greater (entry 0 is usually INT_MAX).
+ * Set externally via sysctl.
 */
- struct sk_buff *packets;
+ int unsched_cutoffs[HOMA_MAX_PRIORITIES];
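/* Editor's note: an illustrative sketch, not part of the patch. It shows
 * one way the @unsched_cutoffs array just declared can be interpreted when
 * choosing a priority for an unscheduled packet; the helper name and
 * parameters are invented for illustration.
 */
static inline int homa_unsched_priority_sketch(const int *cutoffs,
					       int num_priorities, int length)
{
	int i;

	/* Larger index means higher priority, and entry i holds the
	 * largest message size that may use priority i, so scanning
	 * downward returns the highest priority whose cutoff covers
	 * this message length.
	 */
	for (i = num_priorities - 1; i > 0; i--) {
		if (length <= cutoffs[i])
			return i;
	}
	return 0; /* Entry 0 is usually INT_MAX, the catch-all. */
}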
 /**
- * @next_xmit: Pointer to pointer to next packet to transmit (will
- * either refer to @packets or homa_next_skb(skb) for some skb
- * in @packets).
+ * @cutoff_version: increments every time unsched_cutoffs is
+ * modified. Used to determine when we need to send updates to
+ * peers. Note: 16 bits should be fine for this: the worst
+ * that happens is a peer has a super-stale value that equals
+ * our current value, so the peer uses suboptimal cutoffs until the
+ * next version change. Can be set externally via sysctl.
 */
- struct sk_buff **next_xmit;
+ int cutoff_version;
+#endif /* See strip.py */

 /**
- * @next_xmit_offset: All bytes in the message, up to but not
- * including this one, have been transmitted.
+ * @resend_ticks: When an RPC's @silent_ticks reaches this value,
+ * start sending RESEND requests.
 */
- int next_xmit_offset;
+ int resend_ticks;

 /**
- * @active_xmits: The number of threads that are currently
- * transmitting data packets for this RPC; can't reap the RPC
- * until this count becomes zero.
+ * @resend_interval: minimum number of homa timer ticks between
+ * RESENDs for the same RPC.
 */
- atomic_t active_xmits;
+ int resend_interval;

- /** @gso_pkt_data: Number of bytes of message data in each packet
- * of @packets except possibly the last.
+ /**
+ * @timeout_ticks: abort an RPC if its silent_ticks reaches this value.
 */
- int gso_pkt_data;
+ int timeout_ticks;

 /**
- * @unscheduled: Initial bytes of message that we'll send
- * without waiting for grants.
+ * @timeout_resends: Assume that a server is dead if it has not
+ * responded after this many RESENDs have been sent to it.
 */
- int unscheduled;
+ int timeout_resends;

 /**
- * @granted: Total number of bytes we are currently permitted to
- * send, including unscheduled bytes; must wait for grants before
- * sending bytes at or beyond this position. Never larger than
- * @length.
+ * @request_ack_ticks: How many timer ticks we'll wait for the
+ * client to ack an RPC before explicitly requesting an ack.
+ * Set externally via sysctl.
 */
- int granted;
-
- /** @priority: Priority level to use for future scheduled packets. */
- __u8 sched_priority;
+ int request_ack_ticks;

 /**
- * @init_cycles: Time in get_cycles units when this structure was
- * initialized. Used to find the oldest outgoing message.
+ * @reap_limit: Maximum number of packet buffers to free in a
+ * single call to homa_rpc_reap.
 */
- __u64 init_cycles;
-};
+ int reap_limit;

-/**
- * struct homa_message_in - Holds the state of a message received by
- * this machine; used for both requests and responses.
- */
-struct homa_message_in {
 /**
- * @total_length: Size of the entire message, in bytes. A value
- * less than 0 means this structure is uninitialized and therefore
- * not in use.
+ * @dead_buffs_limit: If the number of packet buffers in dead but
+ * not yet reaped RPCs is less than this number, then Homa reaps
+ * RPCs in a way that minimizes impact on performance but may permit
+ * dead RPCs to accumulate. If the number of dead packet buffers
+ * exceeds this value, then Homa switches to a more aggressive approach
+ * to reaping RPCs. Set externally via sysctl.
 */
- int total_length;
+ int dead_buffs_limit;
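/* Editor's note: an illustrative sketch, not part of the patch, showing
 * how the tick thresholds declared above (@resend_ticks, @timeout_ticks)
 * relate to an RPC's silent_ticks count on each homa_timer pass. The
 * helper name and return convention are invented for illustration.
 */
static inline int homa_timer_action_sketch(int silent_ticks,
					   int resend_ticks, int timeout_ticks)
{
	if (silent_ticks >= timeout_ticks)
		return -1;	/* Peer presumed dead; abort the RPC. */
	if (silent_ticks >= resend_ticks)
		return 1;	/* Consider sending a RESEND (subject to
				 * @resend_interval rate limiting). */
	return 0;		/* Still within the normal quiet window. */
}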
 /**
- * @packets: DATA packets received for this message so far. The list
- * is sorted in order of offset (head is lowest offset), but
- * packets can be received out of order, so there may be times
- * when there are holes in the list. Packets in this list contain
- * exactly one data_segment. Packets on this list are removed from
- * this list and freed once all of their data has been copied
- * out to a user buffer.
+ * @max_dead_buffs: The largest aggregate number of packet buffers
+ * in dead (but not yet reaped) RPCs that has existed so far in a
+ * single socket. Readable via sysctl, and may be reset via sysctl
+ * to begin recalculating.
 */
- struct sk_buff_head packets;
+ int max_dead_buffs;
+#ifndef __STRIP__ /* See strip.py */

 /**
- * @num_skbs: Number of buffers currently in @packets. Will be 0 if
- * @total_length is less than 0.
+ * @verbose: Nonzero enables additional logging. Set externally via
+ * sysctl.
 */
- int num_skbs;
+ int verbose;
+#endif /* See strip.py */

 /**
- * @bytes_remaining: Amount of data for this message that has
- * not yet been received; will determine the message's priority.
+ * @max_gso_size: Maximum number of bytes that will be included
+ * in a single output packet that Homa passes to Linux. Can be set
+ * externally via sysctl to lower the limit already enforced by Linux.
 */
- int bytes_remaining;
+ int max_gso_size;

 /**
- * @incoming: Total # of bytes of the message that the sender will
- * transmit without additional grants. Initialized to the number of
- * unscheduled bytes; after that, updated only when grants are sent.
- * Never larger than @total_length. Note: once initialized, this
- * may not be modified without holding @homa->grantable_lock.
+ * @gso_force_software: A non-zero value will cause Homa to perform
+ * segmentation in software using GSO; zero means ask the NIC to
+ * perform TSO. Set externally via sysctl.
 */
- int incoming;
-
- /** @priority: Priority level to include in future GRANTS. */
- int priority;
+ int gso_force_software;

 /**
- * @scheduled: True means some of the bytes of this message
- * must be scheduled with grants.
+ * @wmem_max: Limit on the value of sk_sndbuf for any socket. Set
+ * externally via sysctl.
 */
- bool scheduled;
+ int wmem_max;
+#ifndef __STRIP__ /* See strip.py */

 /**
- * @birth: get_cycles time when this RPC was added to the grantable
- * list. Invalid if RPC isn't in the grantable list.
+ * @hijack_tcp: Non-zero means encapsulate outgoing Homa packets
+ * as TCP packets (i.e. use TCP as the IP protocol). This makes TSO
+ * and RSS work better. Set externally via sysctl.
 */
- __u64 birth;
+ int hijack_tcp;

 /**
- * @copied_out: All of the bytes of the message with offset less
- * than this value have been copied to user-space buffers.
+ * @max_gro_skbs: Maximum number of socket buffers that can be
+ * aggregated by the GRO mechanism. Set externally via sysctl.
 */
- int copied_out;
+ int max_gro_skbs;

 /**
- * @num_bpages: The number of entries in @bpage_offsets used for this
- * message (0 means buffers not allocated yet).
+ * @gro_policy: An OR'ed together collection of bits that determine
+ * how Homa packets should be steered for SoftIRQ handling. A value
+ * of zero will eliminate any Homa-specific behaviors, reverting
+ * to the Linux defaults. Set externally via sysctl (but modifying
+ * it is almost certainly a bad idea; see below).
 */
- __u32 num_bpages;
+ int gro_policy;

- /** @bpage_offsets: Describes buffer space allocated for this message.
- * Each entry is an offset from the start of the buffer region.
- * All but the last pointer refer to areas of size HOMA_BPAGE_SIZE.
+ /* Bits that can be specified for gro_policy. These were created for
+ * testing, in order to evaluate various possible policies; you almost
+ * certainly should not use any value other than HOMA_GRO_NORMAL.
+ * HOMA_GRO_SAME_CORE If isolated packets arrive (not part of a
+ * batch) use the GRO core for SoftIRQ also.
+ * HOMA_GRO_IDLE Use old mechanism for selecting an idle
+ * core for SoftIRQ (deprecated).
+ * HOMA_GRO_NEXT Always use the next core in circular
+ * order for SoftIRQ (deprecated).
+ * HOMA_GRO_GEN2 Use the new mechanism for selecting an
+ * idle core for SoftIRQ.
+ * HOMA_GRO_FAST_GRANTS Pass all grants immediately to
+ * homa_softirq during GRO (only if the
+ * core isn't overloaded).
+ * HOMA_GRO_SHORT_BYPASS Pass all single-packet messages directly
+ * to homa_softirq during GRO (only if the
+ * core isn't overloaded).
+ * HOMA_GRO_GEN3 Use the "Gen3" mechanisms for load
+ * balancing.
+ */
+ #define HOMA_GRO_SAME_CORE 2
+ #define HOMA_GRO_IDLE 4
+ #define HOMA_GRO_NEXT 8
+ #define HOMA_GRO_GEN2 0x10
+ #define HOMA_GRO_FAST_GRANTS 0x20
+ #define HOMA_GRO_SHORT_BYPASS 0x40
+ #define HOMA_GRO_GEN3 0x80
+ #define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE | HOMA_GRO_GEN2 | \
+ HOMA_GRO_SHORT_BYPASS | HOMA_GRO_FAST_GRANTS)
+
+ /**
+ * @busy_usecs: if there has been activity on a core within the
+ * last @busy_usecs, it is considered to be busy and Homa will
+ * try to avoid scheduling other activities on the core. See
+ * balance.txt for more on load balancing. Set externally via sysctl.
+ */
+ int busy_usecs;
+
+ /** @busy_cycles: Same as busy_usecs except in homa_clock() units. */
+ int busy_cycles;
+
+ /**
+ * @gro_busy_usecs: if the gap between the completion of
+ * homa_gro_receive and the next call to homa_gro_receive on the same
+ * core is less than this, then GRO on that core is considered to be
+ * "busy", and optimizations such as HOMA_GRO_SHORT_BYPASS will not be
+ * done because they risk overloading the core. Set externally via
+ * sysctl.
 */
- __u32 bpage_offsets[HOMA_MAX_BPAGES];
-};
+ int gro_busy_usecs;

-/**
- * struct homa_interest - Contains various information used while waiting
- * for incoming messages (indicates what kinds of messages a particular
- * thread is interested in receiving).
- */
-struct homa_interest {
 /**
- * @thread: Thread that would like to receive a message. Will get
- * woken up when a suitable message becomes available.
+ * @gro_busy_cycles: Same as gro_busy_usecs except in homa_clock()
+ * units.
 */
- struct task_struct *thread;
+ int gro_busy_cycles;
+#endif /* See strip.py */

 /**
- * @ready_rpc: This is actually a (struct homa_rpc *) identifying the
- * RPC that was found; NULL if no RPC has been found yet. This
- * variable is used for synchronization to handoff the RPC, and
- * must be set only after @locked is set.
+ * @timer_ticks: number of times that homa_timer has been invoked
+ * (may wraparound, which is safe).
 */
- atomic_long_t ready_rpc;
+ u32 timer_ticks;

 /**
- * @locked: Nonzero means that @ready_rpc is locked; only valid
- * if @ready_rpc is non-NULL.
+ * @flags: a collection of bits that can be set using sysctl
+ * to trigger various behaviors.
 */
- int locked;
+ int flags;
+#ifndef __STRIP__ /* See strip.py */

 /**
- * @reg_rpc: RPC whose @interest field points here, or
- * NULL if none.
+ * @freeze_type: determines conditions under which the time trace
+ * should be frozen. Set externally via sysctl.
 */
- struct homa_rpc *reg_rpc;
+ enum homa_freeze_type freeze_type;
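/* Editor's note: an illustrative sketch, not part of the patch, showing
 * how the HOMA_GRO_* bits defined above compose and can be tested; the
 * helper name and arguments are invented for illustration.
 */
static inline bool homa_gro_short_bypass_sketch(int gro_policy,
						bool core_busy)
{
	/* HOMA_GRO_NORMAL is just an OR of individual policy bits, so a
	 * single mask test decides whether single-packet messages may
	 * bypass the SoftIRQ handoff; the bypass is skipped on a busy
	 * core to avoid overloading it.
	 */
	return (gro_policy & HOMA_GRO_SHORT_BYPASS) && !core_busy;
}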
 /**
- * @request_links: For linking this object into
- * &homa_sock.request_interests. The interest must not be linked
- * on either this list or @response_links if @id is nonzero.
+ * @accept_bits: determines how many consecutive packets will be
+ * accepted before the next bunch of packets is dropped (intervals
+ * between dropped packets are chosen uniformly from the
+ * range [0..1<<accept_bits]). Set externally via sysctl.
 */
- struct list_head request_links;
+ int accept_bits;

- interest->thread = current;
- atomic_long_set(&interest->ready_rpc, 0);
- interest->locked = 0;
- interest->reg_rpc = NULL;
- interest->request_links.next = LIST_POISON1;
- interest->response_links.next = LIST_POISON1;
-}
-
-/**
- * struct homa_rpc - One of these structures exists for each active
- * RPC. The same structure is used to manage both outgoing RPCs on
- * clients and incoming RPCs on servers.
- */
-struct homa_rpc {
- /** @hsk: Socket that owns the RPC. */
- struct homa_sock *hsk;
-
- /** @lock: Used to synchronize modifications to this structure;
- * points to the lock in hsk->client_rpc_buckets or
- * hsk->server_rpc_buckets.
+ * @drop_bits: determines how many consecutive packets are dropped
+ * when drops occur (counts are chosen uniformly from the
+ * range [1..1<<drop_bits]). Set externally via sysctl.
 */
- struct spinlock *lock;
+ int drop_bits;
-
- /**
- * @hash_links: Used to link this object into
- * @hsk->client_rpc_buckets (for a client RPC), or
- * @hsk->server_rpc_buckets (for a server RPC).
- */
- struct hlist_node hash_links;
-
- /**
- * @ready_links: Used to link this object into
- * &homa_sock.ready_requests or &homa_sock.ready_responses.
- */
- struct list_head ready_links;
-
- /**
- * @active_links: For linking this object into @hsk->active_rpcs.
- * The next field will be LIST_POISON1 if this RPC hasn't yet been
- * linked into @hsk->active_rpcs. Access with RCU.
- */
- struct list_head active_links;
-
- /** @dead_links: For linking this object into @hsk->dead_rpcs. */
- struct list_head dead_links;
-
- /**
- * @interest: Describes a thread that wants to be notified when
- * msgin is complete, or NULL if none.
- */
- struct homa_interest *interest;
-
- /**
- * @grantable_links: Used to link this RPC into peer->grantable_rpcs.
- * If this RPC isn't in peer->grantable_rpcs, this is an empty
- * list pointing to itself.
- */
- struct list_head grantable_links;
-
- /**
- * @throttled_links: Used to link this RPC into homa->throttled_rpcs.
- * If this RPC isn't in homa->throttled_rpcs, this is an empty
- * list pointing to itself.
- */
- struct list_head throttled_links;
-
- /**
- * @silent_ticks: Number of times homa_timer has been invoked
- * since the last time a packet indicating progress was received
- * for this RPC, so we don't need to send a resend for a while.
- */
- int silent_ticks;
-
- /**
- * @resend_timer_ticks: Value of homa->timer_ticks the last time
- * we sent a RESEND for this RPC.
- */
- __u32 resend_timer_ticks;
-
- /**
- * @done_timer_ticks: The value of homa->timer_ticks the first
- * time we noticed that this (server) RPC is done (all response
- * packets have been transmitted), so we're ready for an ack.
- * Zero means we haven't reached that point yet.
- */
- __u32 done_timer_ticks;
-
- /**
- * @magic: when the RPC is alive, this holds a distinct value that
- * is unlikely to occur naturally. The value is cleared when the
- * RPC is reaped, so we can detect accidental use of an RPC after
- * it has been reaped.
- */
-#define HOMA_RPC_MAGIC 0xdeadbeef
- int magic;
-
- /**
- * @start_cycles: time (from get_cycles()) when this RPC was created.
- * Used (sometimes) for testing.
- */
- uint64_t start_cycles;
-};
-
-/**
- * homa_rpc_lock() - Acquire the lock for an RPC.
- * @rpc: RPC to lock. Note: this function is only safe under
- * limited conditions. The caller must ensure that the RPC
- * cannot be reaped before the lock is acquired.
It cannot - * do that by acquiring the socket lock, since that violates - * lock ordering constraints. One approach is to increment - * rpc->hsk->reap_disable. Don't use this function unless you - * are very sure what you are doing! See sync.txt for more - * info on locking. - */ -inline static void homa_rpc_lock(struct homa_rpc *rpc) { - if (!spin_trylock_bh(rpc->lock)) - homa_rpc_lock_slow(rpc); -} - -/** - * homa_rpc_unlock() - Release the lock for an RPC. - * @rpc: RPC to unlock. - */ -inline static void homa_rpc_unlock(struct homa_rpc *rpc) { - spin_unlock_bh(rpc->lock); -} - -/** - * homa_rpc_validate() - Check to see if an RPC has been reaped (which - * would mean it is no longer valid); if so, crash the kernel with a stack - * trace. - * @rpc: RPC to validate. - */ -inline static void homa_rpc_validate(struct homa_rpc *rpc) { - if (rpc->magic == HOMA_RPC_MAGIC) - return; - printk(KERN_ERR "Accessing reaped Homa RPC!\n"); - BUG(); -} - -/** - * define HOMA_SOCKTAB_BUCKETS - Number of hash buckets in a homa_socktab. - * Must be a power of 2. - */ -#define HOMA_SOCKTAB_BUCKETS 1024 - -/** - * struct homa_socktab - A hash table that maps from port numbers (either - * client or server) to homa_sock objects. - * - * This table is managed exclusively by homa_socktab.c, using RCU to - * minimize synchronization during lookups. - */ -struct homa_socktab { - /** - * @mutex: Controls all modifications to this object; not needed - * for socket lookups (RCU is used instead). Also used to - * synchronize port allocation. - */ - struct spinlock write_lock; - - /** - * @buckets: Heads of chains for hash table buckets. Chains - * consist of homa_socktab_link objects. - */ - struct hlist_head buckets[HOMA_SOCKTAB_BUCKETS]; -}; - -/** - * struct homa_socktab_links - Used to link homa_socks into the hash chains - * of a homa_socktab. - */ -struct homa_socktab_links { - /* Must be the first element of the struct! */ - struct hlist_node hash_links; - struct homa_sock *sock; -}; - -/** - * struct homa_socktab_scan - Records the state of an iteration over all - * the entries in a homa_socktab, in a way that permits RCU-safe deletion - * of entries. - */ -struct homa_socktab_scan { - /** @socktab: The table that is being scanned. */ - struct homa_socktab *socktab; - - /** - * @current_bucket: the index of the bucket in socktab->buckets - * currently being scanned. If >= HOMA_SOCKTAB_BUCKETS, the scan - * is complete. - */ - int current_bucket; - - /** - * @next: the next socket to return from homa_socktab_next (this - * socket has not yet been returned). NULL means there are no - * more sockets in the current bucket. - */ - struct homa_socktab_links *next; -}; - -/** - * define HOMA_CLIENT_RPC_BUCKETS - Number of buckets in hash tables for - * client RPCs. Must be a power of 2. - */ -#define HOMA_CLIENT_RPC_BUCKETS 1024 - -/** - * define HOMA_SERVER_RPC_BUCKETS - Number of buckets in hash tables for - * server RPCs. Must be a power of 2. - */ -#define HOMA_SERVER_RPC_BUCKETS 1024 - -struct homa_rpc_bucket { - /** - * @lock: serves as a lock both for this bucket (e.g., when - * adding and removing RPCs) and also for all of the RPCs in - * the bucket. Must be held whenever manipulating an RPC in - * this bucket. This dual purpose permits clean and safe - * deletion and garbage collection of RPCs. - */ - struct spinlock lock; - - /** @rpcs: list of RPCs that hash to this bucket. */ - struct hlist_head rpcs; -}; - -/** - * struct homa_bpage - Contains information about a single page in - * a buffer pool. 
Note: this information is stored in user memory, so - * it needs to be managed so that a misbehaving user program can't cause - * kernel crashes (it's OK if a misbehaving program causes the buffer pool - * to misbehave, such as running out of space, as long as it doesn't cause - * a kernel crash). - */ -struct homa_bpage { - union { - /** - * @cache_line: Ensures that each homa_bpage object - * is exactly one cache line long. - */ - struct homa_cache_line cache_line; - struct { - /** @lock: to synchronize shared access. Must never - * wait for this lock, since a faulty user program - * could leave it locked. - */ - struct spinlock lock; - - /** - * @refs: Number of messages with data in this page. - * The kernel increments this when allocating buffer - * space for a message, and the app decrements it when - * done with a message. - */ - atomic_t refs; - - /** - * @owner: kernel core that currently owns this page - * (< 0 if none). - */ - int owner; - - /** - * @expiration: time (in get_cycles units) after - * which it's OK to steal this page from its current - * owner. - */ - __u64 expiration; - }; - }; -}; -_Static_assert(sizeof(struct homa_bpage) == sizeof(struct homa_cache_line), - "homa_bpage overflowed a cache line"); - -/** - * struct homa_pool_core - Holds core-specific data for a homa_pool (a bpage - * out of which that core is allocating small chunks). - */ -struct homa_pool_core { - union { - /** - * @cache_line: Ensures that each object is exactly one - * cache line long. - */ - struct homa_cache_line cache_line; - struct { - /** - * @page_hint: Index of bpage in pool->descriptors, - * which may be owned by this core. If so, we'll use it - * for allocating partial pages. - */ - int page_hint; - - /** - * @allocated: if the page given by @page_hint is - * owned by this core, this variable gives the number of - * (initial) bytes that have already been allocated - * from the page. - */ - int allocated; - }; - }; -}; -_Static_assert(sizeof(struct homa_pool_core) == sizeof(struct homa_cache_line), - "homa_pool_core overflowed a cache line"); - -/** - * struct homa_pool - Describes a pool of buffer space for incoming - * messages for a particular socket; managed by homa_pool.c. The pool is - * divided up into "bpages", which are a multiple of the hardware page size. - * A bpage may be owned by a particular core so that it can more efficiently - * allocate space for small messages. - */ -struct homa_pool { - /** - * @region: beginning of the pool's region (in the app's virtual - * memory). Initial portion is used for bpage metadata shared - * with the application, and the remainder is divided into pages. - * 0 means the pool hasn't yet been initialized. - */ - char *region; - - /** @num_bpages: total number of bpages in the pool. */ - int num_bpages; - - /** - * @homa: shared information about the Homa driver. - */ - struct homa *homa; - - /** @descriptors: kmalloced area containing one entry for each bpage. */ - struct homa_bpage *descriptors; - - /** - * @active_pages: the number of bpages (always the lowest ones) - * that are currently being used for allocation. Varies slowly - * depending on active buffer usage. The goal is to keep this - * number small to minimize memory footprint, while keeping it - * large enough so that many pages are free at any given time - * (so allocation is efficient). - */ - atomic_t active_pages; - - /** - * @next_scan: index of the next page to check while searching for - * a free bpage. 
- */ - atomic_t next_scan; - - /** - * @free_bpages_found: the number of pages successfully allocated - * so far in the current scan (i.e. since @next_scan was set to 0). - */ - atomic_t free_bpages_found; - - /** @cores: core-specific info; dynamically allocated. */ - struct homa_pool_core *cores; - - /** @num_cores: number of elements in @cores. */ - int num_cores; -}; - -/** - * struct homa_sock - Information about an open socket. - */ -struct homa_sock { - /* Info for other network layers. Note: IPv6 info (struct ipv6_pinfo - * comes at the very end of the struct, *after* Homa's data, if this - * socket uses IPv6). - */ - union { - /** @sock: generic socket data; must be the first field. */ - struct sock sock; - - /** - * @inet: generic Internet socket data; must also be the - first field (contains sock as its first member). - */ - struct inet_sock inet; - }; - - /** - * @lock: Must be held when modifying fields such as interests - * and lists of RPCs. This lock is used in place of sk->sk_lock - * because it's used differently (it's always used as a simple - * spin lock). See sync.txt for more on Homa's synchronization - * strategy. - */ - struct spinlock lock; - - /** - * @last_locker: identifies the code that most recently acquired - * @lock successfully. Occasionally used for debugging. */ - char *last_locker; - - /** - * @protect_count: counts the number of calls to homa_protect_rpcs - * for which there have not yet been calls to homa_unprotect_rpcs. - * See sync.txt for more info. - */ - atomic_t protect_count; - - /** - * @homa: Overall state about the Homa implementation. NULL - * means this socket has been deleted. - */ - struct homa *homa; - - /** @shutdown: True means the socket is no longer usable. */ - bool shutdown; - - /** - * @port: Port number: identifies this socket uniquely among all - * those on this node. - */ - __u16 port; - - /** - * @ip_header_length: Length of IP headers for this socket (depends - * on IPv4 vs. IPv6). - */ - int ip_header_length; - - /** - * @client_socktab_links: Links this socket into the homa_socktab - * based on @port. - */ - struct homa_socktab_links socktab_links; - - /** - * @active_rpcs: List of all existing RPCs related to this socket, - * including both client and server RPCs. This list isn't strictly - * needed, since RPCs are already in one of the hash tables below, - * but it's more efficient for homa_timer to have this list - * (so it doesn't have to scan large numbers of hash buckets). - * The list is sorted, with the oldest RPC first. Manipulate with - * RCU so timer can access without locking. - */ - struct list_head active_rpcs; - - /** - * @dead_rpcs: Contains RPCs for which homa_rpc_free has been - * called, but their packet buffers haven't yet been freed. - */ - struct list_head dead_rpcs; - - /** @dead_skbs: Total number of socket buffers in RPCs on dead_rpcs. */ - int dead_skbs; - - /** - * @ready_requests: Contains server RPCs whose request message is - * in a state requiring attention from a user process. The head is - * oldest, i.e. next to return. - */ - struct list_head ready_requests; - - /** - * @ready_responses: Contains client RPCs whose response message is - * in a state requiring attention from a user process. The head is - * oldest, i.e. next to return. - */ - struct list_head ready_responses; - - /** - * @request_interests: List of threads that want to receive incoming - * request messages. 
- */ - struct list_head request_interests; - - /** - * @response_interests: List of threads that want to receive incoming - * response messages. - */ - struct list_head response_interests; - - /** - * @client_rpc_buckets: Hash table for fast lookup of client RPCs. - * Modifications are synchronized with bucket locks, not - * the socket lock. - */ - struct homa_rpc_bucket client_rpc_buckets[HOMA_CLIENT_RPC_BUCKETS]; - - /** - * @server_rpc_buckets: Hash table for fast lookup of server RPCs. - * Modifications are synchronized with bucket locks, not - * the socket lock. - */ - struct homa_rpc_bucket server_rpc_buckets[HOMA_SERVER_RPC_BUCKETS]; - - /** - * @buffer_pool: used to allocate buffer space for incoming messages. - */ - struct homa_pool buffer_pool; -}; - -/** - * struct homa_dead_dst - Used to retain dst_entries that are no longer - * needed, until it is safe to delete them (I'm not confident that the RCU - * mechanism will be safe for these: the reference count could get incremented - * after it's on the RCU list?). - */ -struct homa_dead_dst { - /** @dst: Entry that is no longer used by a struct homa_peer. */ - struct dst_entry *dst; - - /** - * @gc_time: Time (in units of get_cycles) when it is safe - * to free @dst. - */ - __u64 gc_time; - - /** @dst_links: Used to link together entries in peertab->dead_dsts. */ - struct list_head dst_links; -}; - -/** - * define HOMA_PEERTAB_BUCKETS - Number of bits in the bucket index for a - * homa_peertab. Should be large enough to hold an entry for every server - * in a datacenter without long hash chains. - */ -#define HOMA_PEERTAB_BUCKET_BITS 20 - -/** define HOME_PEERTAB_BUCKETS - Number of buckets in a homa_peertab. */ -#define HOMA_PEERTAB_BUCKETS (1 << HOMA_PEERTAB_BUCKET_BITS) - -/** - * struct homa_peertab - A hash table that maps from IPv6 addresses - * to homa_peer objects. IPv4 entries are encapsulated as IPv6 addresses. - * Entries are gradually added to this table, but they are never removed - * except when the entire table is deleted. We can't safely delete because - * results returned by homa_peer_find may be retained indefinitely. - * - * This table is managed exclusively by homa_peertab.c, using RCU to - * permit efficient lookups. - */ -struct homa_peertab { - /** - * @write_lock: Synchronizes addition of new entries; not needed - * for lookups (RCU is used instead). - */ - struct spinlock write_lock; - - /** - * @dead_dsts: List of dst_entries that are waiting to be deleted. - * Hold @write_lock when manipulating. - */ - struct list_head dead_dsts; - - /** - * @buckets: Pointer to heads of chains of homa_peers for each bucket. - * Malloc-ed, and must eventually be freed. NULL means this structure - * has not been initialized. - */ - struct hlist_head *buckets; -}; - -/** - * struct homa_peer - One of these objects exists for each machine that we - * have communicated with (either as client or server). - */ -struct homa_peer { - /** - * @addr: IPv6 address for the machine (IPv4 addresses are stored - * as IPv4-mapped IPv6 addresses). - */ - struct in6_addr addr; - - /** @flow: Addressing info needed to send packets. */ - struct flowi flow; - - /** - * @dst: Used to route packets to this peer; we own a reference - * to this, which we must eventually release. - */ - struct dst_entry *dst; - - /** - * @unsched_cutoffs: priorities to use for unscheduled packets - * sent to this host, as specified in the most recent CUTOFFS - * packet from that host. 
See documentation for @homa.unsched_cutoffs - * for the meanings of these values. - */ - int unsched_cutoffs[HOMA_MAX_PRIORITIES]; - - /** - * @cutoff_version: value of cutoff_version in the most recent - * CUTOFFS packet received from this peer. 0 means we haven't - * yet received a CUTOFFS packet from the host. Note that this is - * stored in network byte order. - */ - __be16 cutoff_version; - - /** - * last_update_jiffies: time in jiffies when we sent the most - * recent CUTOFFS packet to this peer. - */ - unsigned long last_update_jiffies; - - /** - * grantable_rpcs: Contains all homa_rpcs (both requests and - * responses) involving this peer whose msgins require (or required - * them in the past) and have not been fully received. The list is - * sorted in priority order (head has fewest bytes_remaining). - * Locked with homa->grantable_lock. - */ - struct list_head grantable_rpcs; - - /** - * @grantable_links: Used to link this peer into homa->grantable_peers, - * if there are entries in grantable_rpcs. If grantable_rpcs is empty, - * this is an empty list pointing to itself. - */ - struct list_head grantable_links; - - /** - * @peertab_links: Links this object into a bucket of its - * homa_peertab. - */ - struct hlist_node peertab_links; - - /** - * @outstanding_resends: the number of resend requests we have - * sent to this server (spaced @homa.resend_interval apart) since - * we received a packet from this peer. - */ - int outstanding_resends; - - /** - * @most_recent_resend: @homa->timer_ticks when the most recent - * resend was sent to this peer. - */ - int most_recent_resend; - - /** - * @least_recent_rpc: of all the RPCs for this peer scanned at - * @current_ticks, this is the RPC whose @resend_timer_ticks - * is farthest in the past. - */ - struct homa_rpc *least_recent_rpc; - - /** - * @least_recent_ticks: the @resend_timer_ticks value for - * @least_recent_rpc. - */ - __u32 least_recent_ticks; - - /** - * @current_ticks: the value of @homa->timer_ticks the last time - * that @least_recent_rpc and @least_recent_ticks were computed. - * Used to detect the start of a new homa_timer pass. - */ - __u32 current_ticks; - - /** - * @resend_rpc: the value of @least_recent_rpc computed in the - * previous homa_timer pass. This RPC will be issued a RESEND - * in the current pass, if it still needs one. - */ - struct homa_rpc *resend_rpc; - - /** - * @num_acks: the number of (initial) entries in @acks that - * currently hold valid information. - */ - int num_acks; - - /** - * @acks: info about client RPCs whose results have been completely - * received. - */ - struct homa_ack acks[NUM_PEER_UNACKED_IDS]; - - /** - * @ack_lock: used to synchronize access to @num_acks and @acks. - */ - struct spinlock ack_lock; -}; - -/** - * enum homa_freeze_type - The @type argument to homa_freeze must be - * one of these values. - */ -enum homa_freeze_type { - RESTART_RPC = 1, - PEER_TIMEOUT = 2, - SLOW_RPC = 3, - SOCKET_CLOSE = 4, - PACKET_LOST = 5, -}; - -/** - * struct homa - Overall information about the Homa protocol implementation. - * - * There will typically only exist one of these at a time, except during - * unit tests. - */ -struct homa { - /** - * @next_outgoing_id: Id to use for next outgoing RPC request. - * This is always even: it's used only to generate client-side ids. - * Accessed without locks. 
- */ - atomic64_t next_outgoing_id; - - /** - * @link_idle_time: The time, measured by get_cycles() at which we - * estimate that all of the packets we have passed to Linux for - * transmission will have been transmitted. May be in the past. - * This estimate assumes that only Homa is transmitting data, so - * it could be a severe underestimate if there is competing traffic - * from, say, TCP. Access only with atomic ops. - */ - atomic64_t link_idle_time __attribute__((aligned(CACHE_LINE_SIZE))); - - /** - * @grantable_lock: Used to synchronize access to @grantable_peers and - * @num_grantable_peers. - */ - struct spinlock grantable_lock __attribute__((aligned(CACHE_LINE_SIZE))); - - /** - * @grantable_peers: Contains all homa_peers for which there are - * RPCs that have not been fully granted. The list is sorted in - * priority order (the rpc with the fewest bytes_remaining is the - * first one on the first peer's list). - */ - struct list_head grantable_peers; - - /** @num_grantable_peers: The number of peers in grantable_peers. */ - int num_grantable_peers; - - /** - * @grant_nonfifo: How many bytes should be granted using the - * normal priority system between grants to the oldest message. - */ - int grant_nonfifo; - - /** - * @grant_nonfifo_left: Counts down bytes using the normal - * priority mechanism. When this reaches zero, it's time to grant - * to the old message. - */ - int grant_nonfifo_left; - - /** - * @pacer_mutex: Ensures that only one instance of homa_pacer_xmit - * runs at a time. Only used in "try" mode: never block on this. - */ - struct spinlock pacer_mutex __attribute__((aligned(CACHE_LINE_SIZE))); - - /** - * @pacer_fifo_fraction: The fraction of time (in thousandths) when - * the pacer should transmit next from the oldest message, rather - * than the highest-priority message. Set externally via sysctl. - */ - int pacer_fifo_fraction; - - /** - * @pacer_fifo_count: When this becomes <= zero, it's time for the - * pacer to allow the oldest RPC to transmit. - */ - int pacer_fifo_count; - - /** - * @pacer_start: get_cycles() time when the pacer last woke up - * (if the pacer is running) or 0 if the pacer is sleeping. - */ - __u64 pacer_wake_time; - - /** - * @throttle_lock: Used to synchronize access to @throttled_rpcs. To - * insert or remove an RPC from throttled_rpcs, must first acquire - * the RPC's socket lock, then this lock. - */ - struct spinlock throttle_lock; - - /** - * @throttled_rpcs: Contains all homa_rpcs that have bytes ready - * for transmission, but which couldn't be sent without exceeding - * the queue limits for transmission. Manipulate only with "_rcu" - * functions. - */ - struct list_head throttled_rpcs; - - /** - * @throttle_add: The get_cycles() time when the most recent RPC - * was added to @throttled_rpcs. - */ - __u64 throttle_add; - - /** - * @throttle_min_bytes: If a packet has fewer bytes than this, then it - * bypasses the throttle mechanism and is transmitted immediately. - * We have this limit because for very small packets we can't keep - * up with the NIC (we're limited by CPU overheads); there's no - * need for throttling and going through the throttle mechanism - * adds overhead, which slows things down. At least, that's the - * hypothesis (needs to be verified experimentally!). Set externally - * via sysctl. 
- */ - int throttle_min_bytes; - - /** - * @total_incoming: the total number of bytes that we expect to receive - * (across all messages) even if we don't send out any more grants - * (includes granted but unreceived bytes, plus unreceived unscheduled - * bytes that we know about). This can potentially be negative, if - * a peer sends more bytes than granted (see synchronization note in - * homa_send_grants for why we have to allow this possibility). - */ - atomic_t total_incoming __attribute__((aligned(CACHE_LINE_SIZE))); - - /** - * @next_client_port: A client port number to consider for the - * next Homa socket; increments monotonically. Current value may - * be in the range allocated for servers; must check before using. - * This port may also be in use already; must check. - */ - __u16 next_client_port __attribute__((aligned(CACHE_LINE_SIZE))); - - /** - * @port_map: Information about all open sockets. - */ - struct homa_socktab port_map __attribute__((aligned(CACHE_LINE_SIZE))); - - /** - * @peertab: Info about all the other hosts we have communicated with. - */ - struct homa_peertab peers; - - /** - * @rtt_bytes: An estimate of the amount of data that can be transmitted - * over the wire in the time it takes to send a full-size data packet - * and receive back a grant. Used to ensure full utilization of - * uplink bandwidth. Set externally via sysctl. - */ - int rtt_bytes; - - /** - * @max_grant_window: if nonzero, determines the maximum number - * of granted-but-not-yet-received bytes for a message (may be - * greater than rtt_bytes). This feature is currently for - * experimentation only. Set externally via sysctl.*/ - int max_grant_window; - - /** - * @link_bandwidth: The raw bandwidth of the network uplink, in - * units of 1e06 bits per second. Set externally via sysctl. - */ - int link_mbps; - - /** - * @poll_usecs: Amount of time (in microseconds) that a thread - * will spend busy-waiting for an incoming messages before - * going to sleep. Set externally via sysctl. - */ - int poll_usecs; - - /** - * @poll_cycles: The value of @poll_usecs in the units returned - * by get_cycles(). - */ - int poll_cycles; - - /** - * @num_priorities: The total number of priority levels available for - * Homa's use. Internally, Homa will use priorities from 0 to - * num_priorities-1, inclusive. Set externally via sysctl. - */ - int num_priorities; - - /** - * @priority_map: entry i gives the value to store in the high-order - * 3 bits of the DSCP field of IP headers to implement priority level - * i. Set externally via sysctl. - */ - int priority_map[HOMA_MAX_PRIORITIES]; - - /** - * @max_sched_prio: The highest priority level currently available for - * scheduled packets. Levels above this are reserved for unscheduled - * packets. Set externally via sysctl. - */ - int max_sched_prio; - - /** - * @unsched_cutoffs: the current priority assignments for incoming - * unscheduled packets. The value of entry i is the largest - * message size that uses priority i (larger i is higher priority). - * If entry i has a value of HOMA_MAX_MESSAGE_SIZE or greater, then - * priority levels less than i will not be used for unscheduled - * packets. At least one entry in the array must have a value of - * HOMA_MAX_MESSAGE_SIZE or greater (entry 0 is usually INT_MAX). - * Set externally via sysctl. - */ - int unsched_cutoffs[HOMA_MAX_PRIORITIES]; - - /** - * @cutoff_version: increments every time unsched_cutoffs is - * modified. Used to determine when we need to send updates to - * peers. 
Note: 16 bits should be fine for this: the worst - * that happens is a peer has a super-stale value that equals - * our current value, so the peer uses suboptimal cutoffs until the - * next version change. Can be set externally via sysctl. - */ - int cutoff_version; - - /** - * @fifo_grant_increment: how many additional bytes to grant in - * a "pity" grant sent to the oldest outstanding message. Set - * externally via sysctl. - */ - int fifo_grant_increment; - - /** - * @grant_fifo_fraction: The fraction (in thousandths) of granted - * bytes that should go to the *oldest* incoming message, rather - * than the highest priority ones. Set externally via sysctl. - */ - int grant_fifo_fraction; - - /** - * @duty_cycle: Sets a limit on the fraction of network bandwidth that - * may be consumed by a single RPC in units of one-thousandth (1000 - * means a single RPC can consume all of the incoming network - * bandwidth, 500 means half, and so on). This also determines the - * fraction of a core that can be consumed by NAPI when a large - * message is being received. Its main purpose is to keep NAPI from - * monopolizing a core so much that user threads starve. Set externally - * via sysctl. - */ - int duty_cycle; - - /** - * @grant_threshold: A grant will not be sent for an RPC until - * the number of incoming bytes drops below this threshold. Computed - * from @rtt_bytes and @duty_cycle. - */ - int grant_threshold; - - /** - * @max_overcommit: The maximum number of messages to which Homa will - * send grants at any given point in time. Set externally via sysctl. - */ - int max_overcommit; - - /** - * @max_incoming: This value is computed from max_overcommit, and - * is the limit on how many bytes are currently permitted to be - * granted but not yet received, cumulative across all messages. - */ - int max_incoming; - - /** - * @resend_ticks: When an RPC's @silent_ticks reaches this value, - * start sending RESEND requests. - */ - int resend_ticks; - - /** - * @resend_interval: minimum number of homa timer ticks between - * RESENDs to the same peer. - */ - int resend_interval; - - /** - * @timeout_resends: Assume that a server is dead if it has not - * responded after this many RESENDs have been sent to it. - */ - int timeout_resends; - - /** - * @request_ack_ticks: How many timer ticks we'll wait for the - * client to ack an RPC before explicitly requesting an ack. - * Set externally via sysctl. - */ - int request_ack_ticks; - - /** - * @reap_limit: Maximum number of packet buffers to free in a - * single call to home_rpc_reap. - */ - int reap_limit; - - /** - * @dead_buffs_limit: If the number of packet buffers in dead but - * not yet reaped RPCs is less than this number, then Homa reaps - * RPCs in a way that minimizes impact on performance but may permit - * dead RPCs to accumulate. If the number of dead packet buffers - * exceeds this value, then Homa switches to a more aggressive approach - * to reaping RPCs. Set externally via sysctl. - */ - int dead_buffs_limit; - - /** - * @max_dead_buffs: The largest aggregate number of packet buffers - * in dead (but not yet reaped) RPCs that has existed so far in a - * single socket. Readable via sysctl, and may be reset via sysctl - * to begin recalculating. - */ - int max_dead_buffs; - - /** - * @pacer_kthread: Kernel thread that transmits packets from - * throttled_rpcs in a way that limits queue buildup in the - * NIC. 
- */ - struct task_struct *pacer_kthread; - - /** - * @pacer_exit: true means that the pacer thread should exit as - * soon as possible. - */ - bool pacer_exit; - - /** - * @max_nic_queue_ns: Limits the NIC queue length: we won't queue - * up a packet for transmission if link_idle_time is this many - * nanoseconds in the future (or more). Set externally via sysctl. - */ - int max_nic_queue_ns; - - /** - * @max_nic_queue_cycles: Same as max_nic_queue_ns, except in units - * of get_cycles(). - */ - int max_nic_queue_cycles; - - /** - * @cycles_per_kbyte: the number of cycles, as measured by get_cycles(), - * that it takes to transmit 1000 bytes on our uplink. This is actually - * a slight overestimate of the value, to ensure that we don't - * underestimate NIC queue length and queue too many packets. - */ - __u32 cycles_per_kbyte; - - /** - * @verbose: Nonzero enables additional logging. Set externally via - * sysctl. - */ - int verbose; - - /** - * @max_gso_size: Maximum number of bytes that will be included - * in a single output packet that Homa passes to Linux. Can be set - * externally via sysctl to lower the limit already enforced by Linux. - */ - int max_gso_size; - - /** - * @max_gro_skbs: Maximum number of socket buffers that can be - * aggregated by the GRO mechanism. Set externally via sysctl. - */ - int max_gro_skbs; - - /** - * @gso_force_software: A non-zero value will cause Home to perform - * segmentation in software using GSO; zero means ask the NIC to - * perform TSO. Set externally via sysctl. - */ - int gso_force_software; - - /** - * @gro_policy: An OR'ed together collection of bits that determine - * how Homa packets should be steered for SoftIRQ handling. A value - * of zero will eliminate any Homa-specific behaviors, reverting - * to the Linux defaults. Set externally via sysctl (but modifying - * it is almost certainly a bad idea; see below). - */ - int gro_policy; - - /* Bits that can be specified for gro_policy. These were created for - * testing, in order to evaluate various possible policies; you almost - * certainly should not use any value other than HOMA_GRO_NORMAL. - * HOMA_GRO_BYPASS: Pass all incoming packets directly to - * homa_softirq during GRO; this bypasses - * the SoftIRQ dispatching mechanism as well - * as the network and IP stack layers. - * HOMA_GRO_SAME_CORE If isolated packets arrive (not part of - * a batch) use the GRO core for SoftIRQ also. - * HOMA_GRO_IDLE Use old mechanism for selecting an idle - * core for SoftIRQ (deprecated). - * HOMA_GRO_NEXT Always use the next core in circular - * order for SoftIRQ (deprecated). - * HOMA_GRO_IDLE_NEW Use the new mechanism for selecting an - * idle core for SoftIRQ. - * HOMA_GRO_FAST_GRANTS Pass all grant I can see immediately to - * homa_softirq during GRO. - * HOMA_GRO_SHORT_BYPASS Pass all short packets directly to - * homa_softirq during GR). - */ - #define HOMA_GRO_BYPASS 1 - #define HOMA_GRO_SAME_CORE 2 - #define HOMA_GRO_IDLE 4 - #define HOMA_GRO_NEXT 8 - #define HOMA_GRO_IDLE_NEW 16 - #define HOMA_GRO_FAST_GRANTS 32 - #define HOMA_GRO_SHORT_BYPASS 64 - #define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE_NEW \ - |HOMA_GRO_SHORT_BYPASS) - - /* - * @gro_busy_usecs: try not to schedule SoftIRQ processing on a core - * if it has handled Homa packets at GRO level in the last - * gro_busy_us microseconds (improve load balancing by avoiding - * hot spots). Set externally via sysctl. 
- */ - int gro_busy_usecs; - - /** - * @gro_busy_cycles: Same as gro_busy_usecs, except in units - * of get_cycles(). - */ - int gro_busy_cycles; - - /** - * @timer_ticks: number of times that homa_timer has been invoked - * (may wraparound, which is safe). - */ - __u32 timer_ticks; - - /** - * @metrics_lock: Used to synchronize accesses to @metrics_active_opens - * and updates to @metrics. - */ - struct spinlock metrics_lock; - - /* - * @metrics: a human-readable string containing recent values - * for all the Homa performance metrics, as generated by - * homa_append_metric. This string is kmalloc-ed; NULL means - * homa_append_metric has never been called. - */ - char* metrics; - - /** @metrics_capacity: number of bytes available at metrics. */ - size_t metrics_capacity; - - /** - * @metrics_length: current length of the string in metrics, - * not including terminating NULL character. - */ - size_t metrics_length; - - /** - * @metrics_active_opens: number of open struct files that - * currently exist for the metrics file in /proc. - */ - int metrics_active_opens; - - /** - * @flags: a collection of bits that can be set using sysctl - * to trigger various behaviors. - */ - int flags; - - /** - * @freeze_type: determines conditions under which the time trace - * should be frozen. Set externally via sysctl. - */ - enum homa_freeze_type freeze_type; - - /** - * @sync_freeze: nonzero means that on completion of the next - * client RPC we should freeze our timetrace and also the peer's. - * Then clear this back to zero again. Set externally via sysctl. - */ - int sync_freeze; - - /** - * @bpage_lease_usecs: how long a core can own a bpage (microseconds) - * before its ownership can be revoked to reclaim the page. - */ - int bpage_lease_usecs; - - /** - * @bpage_lease_cycles: The value of @bpage_lease_usecs in get_cycles - * units. - */ - int bpage_lease_cycles; - - /** - * @temp: the values in this array can be read and written with sysctl. - * They have no officially defined purpose, and are available for - * short-term use during testing. - */ - int temp[4]; -}; - -/** - * struct homa_metrics - various performance counters kept by Homa. - * - * There is one of these structures for each core, so counters can - * be updated without worrying about synchronization or extra cache - * misses. This isn't quite perfect (it's conceivable that a process - * could move from one CPU to another in the middle of updating a counter), - * but this is unlikely, and we can tolerate the occasional miscounts - * that might result. - * - * All counters are free-running: they never reset. - */ -#define HOMA_NUM_SMALL_COUNTS 64 -#define HOMA_NUM_MEDIUM_COUNTS 128 -struct homa_metrics { - /** - * @small_msg_bytes: entry i holds the total number of bytes - * received in messages whose length is between 64*i and 64*i + 63, - * inclusive. - */ - __u64 small_msg_bytes[HOMA_NUM_SMALL_COUNTS]; - - /** - * @medium_msg_bytes: entry i holds the total number of bytes - * received in messages whose length is between 1024*i and - * 1024*i + 1023, inclusive. The first four entries are always 0 - * (small_msg_counts covers this range). - */ - __u64 medium_msg_bytes[HOMA_NUM_MEDIUM_COUNTS]; - - /** - * @large_msg_count: the total number of messages received whose - * length is too large to appear in medium_msg_bytes. - */ - __u64 large_msg_count; - - /** - * @large_msg_bytes: the total number of bytes received in - * messages too large to be counted by medium_msg_bytes. 
- */ - __u64 large_msg_bytes; - - /** - * @sent_msg_bytes: The total number of bytes in outbound - * messages. - */ - __u64 sent_msg_bytes; - - /** - * @packets_sent: total number of packets sent for each packet type - * (entry 0 corresponds to DATA, and so on). - */ - __u64 packets_sent[BOGUS-DATA]; - - /** - * @packets_received: total number of packets received for each - * packet type (entry 0 corresponds to DATA, and so on). - */ - __u64 packets_received[BOGUS-DATA]; - - /** @priority_bytes: total bytes sent at each priority level. */ - __u64 priority_bytes[HOMA_MAX_PRIORITIES]; - - /** @priority_packets: total packets sent at each priority level. */ - __u64 priority_packets[HOMA_MAX_PRIORITIES]; - - /** - * @requests_received: total number of request messages received. - */ - __u64 requests_received; - - /** - * @requests_queued: total number of requests that were added to - * @homa->ready_requests (no thread was waiting). - */ - __u64 requests_queued; - - /** - * @responses_received: total number of response messages received. - */ - __u64 responses_received; - - /** - * @responses_queued: total number of responses that were added to - * @homa->ready_responses (no thread was waiting). - */ - __u64 responses_queued; - - /** - * @fast_wakeups: total number of times that a message arrived for - * a receiving thread that was polling in homa_wait_for_message. - */ - __u64 fast_wakeups; - - /** - * @slow_wakeups: total number of times that a receiving thread - * had to be put to sleep (no message arrived while it was polling). - */ - __u64 slow_wakeups; - - /** - * @poll_cycles: total time spent in the polling loop in - * homa_wait_for_message, as measured with get_cycles(). - */ - __u64 poll_cycles; - - /** - * @softirq_calls: total number of calls to homa_softirq (i.e., - * total number of GRO packets processed, each of which could contain - * multiple Homa packets. - */ - __u64 softirq_calls; - - /** - * @softirq_cycles: total time spent executing homa_softirq when - * invoked under Linux's SoftIRQ handler, as measured with get_cycles(). - */ - __u64 softirq_cycles; - - /** - * @bypass_softirq_cycles: total time spent executing homa_softirq when - * invoked during GRO, bypassing the SoftIRQ mechanism. - */ - __u64 bypass_softirq_cycles; - - /** - * @linux_softirq_cycles: total time spent executing all softirq - * activities, as measured by the linux softirq module, in get_cycles() - * units. Only available with modified Linux kernels. - */ - __u64 linux_softirq_cycles; - - /** - * @napi_cycles: total time spent executing all NAPI activities, - * as measured by the linux softirq module, in get_cycles() units. - * Only available with modified Linux kernels. - */ - __u64 napi_cycles; - - /** - * @send_cycles: total time spent executing the homa_ioc_send - * kernel call handler, as measured with get_cycles(). - */ - __u64 send_cycles; - - /** @send_calls: total number of invocations of the send kernel call. */ - __u64 send_calls; - - /** - * @recv_cycles: total time spent executing homa_recvmsg (including - * time when the thread is blocked), as measured with get_cycles(). - */ - __u64 recv_cycles; - - /** @recv_calls: total number of invocations of homa_recvmsg. */ - __u64 recv_calls; - - /** - * @blocked_cycles: total time threads spend in blocked state - * while executing the homa_recvmsg kernel call handler. - */ - __u64 blocked_cycles; - - /** - * @reply_cycles: total time spent executing the homa_ioc_reply - * kernel call handler, as measured with get_cycles(). 
- */ - __u64 reply_cycles; - - /** @reply_calls: total number of invocations of the reply kernel call. */ - __u64 reply_calls; - - /** - * @abort_cycles: total time spent executing the homa_ioc_abort - * kernel call handler, as measured with get_cycles(). - */ - __u64 abort_cycles; - - /** - * @abort_calls: total number of invocations of the homa_ioc_abort - * kernel call. - */ - __u64 abort_calls; - - /** - * @so_set_buf_cycles: total time spent executing the homa_ioc_set_buf - * kernel call handler, as measured with get_cycles(). - */ - __u64 so_set_buf_cycles; - - /** - * @so_set_buf_calls: total number of invocations of the homa_ioc_set_buf - * kernel call. - */ - __u64 so_set_buf_calls; - - /** - * @grant_cycles: total time spent in homa_send_grants, as measured - * with get_cycles(). - */ - __u64 grant_cycles; - - /** - * @timer_cycles: total time spent in homa_timer, as measured with - * get_cycles(). - */ - __u64 timer_cycles; - - /** - * @timer_reap_cycles: total time spent by homa_timer to reap dead - * RPCs, as measured with get_cycles(). This time is included in - * @timer_cycles. - */ - __u64 timer_reap_cycles; - - /** - * @data_pkt_reap_cycles: total time spent by homa_data_pkt to reap - * dead RPCs, as measured with get_cycles(). - */ - __u64 data_pkt_reap_cycles; - - /** - * @pacer_cycles: total time spent executing in homa_pacer_main - * (not including blocked time), as measured with get_cycles(). - */ - __u64 pacer_cycles; - - /** - * @pacer_lost_cycles: unnecessary delays in transmitting packets - * (i.e. wasted output bandwidth) because the pacer was slow or got - * descheduled. - */ - __u64 pacer_lost_cycles; - - /** - * @pacer_bytes: total number of bytes transmitted when - * @homa->throttled_rpcs is nonempty. - */ - __u64 pacer_bytes; - - /** - * @pacer_skipped_rpcs: total number of times that the pacer had to - * abort because it couldn't lock an RPC. - */ - __u64 pacer_skipped_rpcs; - - /** - * @pacer_needed_help: total number of times that homa_check_pacer - * found that the pacer was running behind, so it actually invoked - * homa_pacer_xmit. - */ - __u64 pacer_needed_help; - - /** - * @throttled_cycles: total amount of time that @homa->throttled_rpcs - * is nonempty, as measured with get_cycles(). - */ - __u64 throttled_cycles; - - /** - * @resent_packets: total number of data packets issued in response to - * RESEND packets. - */ - __u64 resent_packets; - - /** - * @peer_hash_links: total # of link traversals in homa_peer_find. - */ - __u64 peer_hash_links; - - /** - * @peer_new_entries: total # of new entries created in Homa's - * peer table (this value doesn't increment if the desired peer is - * found in the entry in its hash chain). - */ - __u64 peer_new_entries; - - /** - * @peer_kmalloc errors: total number of times homa_peer_find - * returned an error because it couldn't allocate memory for a new - * peer. - */ - __u64 peer_kmalloc_errors; - - /** - * @peer_route errors: total number of times homa_peer_find - * returned an error because it couldn't create a route to the peer. - */ - __u64 peer_route_errors; - - /** - * @control_xmit_errors errors: total number of times ip_queue_xmit - * failed when transmitting a control packet. - */ - __u64 control_xmit_errors; - - /** - * @data_xmit_errors errors: total number of times ip_queue_xmit - * failed when transmitting a data packet. - */ - __u64 data_xmit_errors; - - /** - * @unknown_rpc: total number of times an incoming packet was - * discarded because it referred to a nonexistent RPC. 
Doesn't - * count grant packets received by servers (since these are - * fairly common). - */ - __u64 unknown_rpcs; - - /** - * @cant_create_server_rpc: total number of times a server discarded - * an incoming packet because it couldn't create a homa_rpc object. - */ - __u64 server_cant_create_rpcs; - - /** - * @unknown_packet_type: total number of times a packet was discarded - * because its type wasn't one of the supported values. - */ - __u64 unknown_packet_types; - - /** - * @short_packets: total number of times a packet was discarded - * because it was too short to hold all the required information. - */ - __u64 short_packets; - - /** - * @redundant_packets: total number of times a packet was discarded - * because all of its they had already been received (perhaps a - * resent packet that turned out to be unnecessary?). - */ - __u64 redundant_packets; - - /** - * @resent_packets_used: total number of times a resent packet was - * actually incorporated into the message at the target (i.e. it - * wasn't redundant). - */ - __u64 resent_packets_used; - - /** - * @peer_timeouts: total number of times a peer (either client or - * server) was found to be nonresponsive, resulting in RPC aborts. - */ - __u64 peer_timeouts; - - /** - * @server_rpc_discards: total number of times an RPC was aborted on - * the server side because of a timeout. - */ - __u64 server_rpc_discards; - - /** - * @server_rpcs_unknown: total number of times an RPC was aborted on - * the server side because it is no longer known to the client. - */ - __u64 server_rpcs_unknown; - - /** - * @client_lock_misses: total number of times that Homa had to wait - * to acquire a client bucket lock. - */ - __u64 client_lock_misses; - - /** - * @client_lock_miss_cycles: total time spent waiting for client - * bucket lock misses, measured by get_cycles(). - */ - __u64 client_lock_miss_cycles; - - /** - * @server_lock_misses: total number of times that Homa had to wait - * to acquire a server bucket lock. - */ - __u64 server_lock_misses; - - /** - * @server_lock_miss_cycles: total time spent waiting for server - * bucket lock misses, measured by get_cycles(). - */ - __u64 server_lock_miss_cycles; - - /** - * @socket_lock_miss_cycles: total time spent waiting for socket - * lock misses, measured by get_cycles(). - */ - __u64 socket_lock_miss_cycles; - - /** - * @socket_lock_misses: total number of times that Homa had to wait - * to acquire a socket lock. - */ - __u64 socket_lock_misses; - - /** - * @throttle_lock_miss_cycles: total time spent waiting for throttle - * lock misses, measured by get_cycles(). - */ - __u64 throttle_lock_miss_cycles; - - /** - * @throttle_lock_misses: total number of times that Homa had to wait - * to acquire the throttle lock. - */ - __u64 throttle_lock_misses; - - /** - * @grantable_lock_miss_cycles: total time spent waiting for grantable - * lock misses, measured by get_cycles(). - */ - __u64 grantable_lock_miss_cycles; - - /** - * @grantable_lock_misses: total number of times that Homa had to wait - * to acquire the grantable lock. - */ - __u64 grantable_lock_misses; - - /** - * @peer_acklock_miss_cycles: total time spent waiting for peer - * lock misses, measured by get_cycles(). - */ - __u64 peer_ack_lock_miss_cycles; - - /** - * @peer_ack_lock_misses: total number of times that Homa had to wait - * to acquire the lock used for managing acks for a peer. - */ - __u64 peer_ack_lock_misses; - - /** - * @disabled_reaps: total number of times that the reaper couldn't - * run at all because it was disabled. 
- */ - __u64 disabled_reaps; - - /** - * @disabled_rpc_reaps: total number of times that the reaper skipped - * an RPC because reaping was disabled for that particular RPC - */ - __u64 disabled_rpc_reaps; - - /** - * @reaper_runs: total number of times that the reaper was invoked - * and was not disabled. - */ - __u64 reaper_calls; - - /** - * @reaper_dead_skbs: incremented by hsk->dead_skbs each time that - * reaper_calls is incremented. - */ - __u64 reaper_dead_skbs; - - /** - * @forced_reaps: total number of times that homa_wait_for_message - * invoked the reaper because dead_skbs was too high. - */ - __u64 forced_reaps; - - /** - * @throttle_list_adds: total number of calls to homa_add_to_throttled. - */ - __u64 throttle_list_adds; - - /** - * @throttle_list_checks: number of list elements examined in - * calls to homa_add_to_throttled. - */ - __u64 throttle_list_checks; - - /** - * @fifo_grants: total number of times that grants were sent to - * the oldest message. - */ - __u64 fifo_grants; - - /** - * @fifo_grants_no_incoming: total number of times that, when a - * FIFO grant was issued, the message had no outstanding grants - * (everything granted had been received). - */ - __u64 fifo_grants_no_incoming; - - /** - * @unacked_overflows: total number of times that homa_peer_add_ack - * found insufficient space for the new id and hence had to send an - * ACK message. - */ - __u64 ack_overflows; - - /** - * @ignored_need_acks: total number of times that a NEED_ACK packet - * was ignored because the RPC's result hadn't been fully received. - */ - __u64 ignored_need_acks; - - /** - * @bpage_resuses: total number of times that, when an owned page - * reached the end, it could be reused because all existing - * allocations had been released. - */ - __u64 bpage_reuses; - - /** @temp: For temporary use during testing. */ -#define NUM_TEMP_METRICS 10 - __u64 temp[NUM_TEMP_METRICS]; -}; - -/** - * struct homa_core - Homa allocates one of these structures for each - * core, to hold information that needs to be kept on a per-core basis. - */ -struct homa_core { - - /** - * @last_active: the last time (in get_cycle() units) that - * there was system activity, such NAPI or SoftIRQ, on this - * core. Used to pick a less-busy core for assigning SoftIRQ - * handlers. - */ - __u64 last_active; - - /** - * @last_gro: the last time (in get_cycle() units) that Homa - * processed packets at GRO(NAPI) level on this core. Used to - * avoid assigning SoftIRQ handlers to this core when it has - * been used recently for GRO. - */ - __u64 last_gro; - - /** - * @softirq_backlog: the number of batches of packets that have - * been queued for SoftIRQ processing on this core but haven't - * yet been processed. - */ - atomic_t softirq_backlog; - - /** - * @softirq_offset: used when rotating SoftIRQ assignment among - * the next cores; contains an offset to add to the current core - * to produce the core for SoftIRQ. - */ - int softirq_offset; - - /** - * held_skb: last packet buffer known to be available for - * merging other packets into on this core (note: may not still - * be available), or NULL if none. - */ - struct sk_buff *held_skb; - - /** - * @held_bucket: the index, within napi->gro_hash, of the list - * containing @held_skb; undefined if @held_skb is NULL. Used to - * verify that @held_skb is still available. - */ - int held_bucket; - - /** - * @thread: the most recent thread to invoke a Homa system call - * on this core, or NULL if none. 
- */ - struct task_struct *thread; - - /** - * @syscall_end_time: the time, in get_cycle() units, when the last - * Homa system call completed on this core. Meaningless if thread - * is NULL. - */ - __u64 syscall_end_time; - - /** @metrics: performance statistics for this core. */ - struct homa_metrics metrics; -}; + int num_peers; +}; /** * struct homa_skb_info - Additional information needed by Homa for each - * sk_buff. Space is allocated for this at the very end of the skb. + * outbound DATA packet. Space is allocated for this at the very end of the + * linear part of the skb. */ struct homa_skb_info { - /** - * @next_skb: used to link together all of the skb's for a Homa - * message (in order of offset). - */ + /** @next_skb: used to link together outgoing skb's for a message. */ struct sk_buff *next_skb; /** @@ -2622,252 +540,51 @@ struct homa_skb_info { * header, CRC, preamble, and inter-packet gap. */ int wire_bytes; -}; - -/** - * homa_get_skb_info() - Return the address of Homa's private information - * for an sk_buff. - * @skb: Socket buffer whose info is needed. - */ -static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb) -{ - return (struct homa_skb_info *) (skb_end_pointer(skb) - - sizeof(struct homa_skb_info)); -} - -/** - * homa_is_client(): returns true if we are the client for a particular RPC, - * false if we are the server. - * @id: Id of the RPC in question. - */ -static inline bool homa_is_client(__u64 id) -{ - return (id & 1) == 0; -} -/** - * homa_local_id(): given an RPC identifier from an input packet (which - * is network-encoded), return the decoded id we should use for that - * RPC on this machine. - * @sender_id: RPC id from an incoming packet, such as h->common.sender_id - */ -static inline __u64 homa_local_id(__be64 sender_id) -{ - /* If the client bit was set on the sender side, it needs to be - * removed here, and conversely. + /** + * @data_bytes: total bytes of message data across all of the + * segments in this packet. */ - return be64_to_cpu(sender_id) ^ 1; -} + int data_bytes; -#define homa_bucket_lock(bucket, type) \ - if (unlikely(!spin_trylock_bh(&bucket->lock))) { \ - __u64 start = get_cycles(); \ - INC_METRIC(type##_lock_misses, 1); \ - spin_lock_bh(&bucket->lock); \ - INC_METRIC(type##_lock_miss_cycles, get_cycles() - start); \ - } + /** @seg_length: maximum number of data bytes in each GSO segment. */ + int seg_length; -/** - * homa_client_rpc_bucket() - Find the bucket containing a given - * client RPC. - * @hsk: Socket associated with the RPC. - * @id: Id of the desired RPC. - * - * Return: The bucket in which this RPC will appear, if the RPC exists. - */ -static inline struct homa_rpc_bucket *homa_client_rpc_bucket( - struct homa_sock *hsk, __u64 id) -{ - /* We can use a really simple hash function here because RPC ids - * are allocated sequentially. + /** + * @offset: offset within the message of the first byte of data in + * this packet. */ - return &hsk->client_rpc_buckets[(id >> 1) - & (HOMA_CLIENT_RPC_BUCKETS - 1)]; -} - -/** - * homa_next_skb() - Compute address of Homa's private link field in @skb. - * @skb: Socket buffer containing private link field. - * - * Homa needs to keep a list of buffers in a message, but it can't use the - * links built into sk_buffs because Homa wants to retain its list even - * after sending the packet, and the built-in links get used during sending. - * Thus we allocate extra space at the very end of the packet's data - * area to hold a forward pointer for a list. 
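+/* Editorial sketch (not in the original sources): the old scheme kept
+ * just one link pointer at the end of the skb's data area, while the
+ * new scheme reserves a whole metadata struct there:
+ *
+ *	old:	*(struct sk_buff **)(skb_end_pointer(skb) - sizeof(char *))
+ *	new:	(struct homa_skb_info *)skb_end_pointer(skb) - 1
+ */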
- */ -static inline struct sk_buff **homa_next_skb(struct sk_buff *skb) -{ - return (struct sk_buff **) (skb_end_pointer(skb) - sizeof(char*)); -} + int offset; -/** - * port_hash() - Hash function for port numbers. - * @port: Port number being looked up. - * - * Return: The index of the bucket in which this port will be found (if - * it exists. - */ -static inline int homa_port_hash(__u16 port) -{ - /* We can use a really simple hash function here because client - * port numbers are allocated sequentially and server port numbers - * are unpredictable. - */ - return port & (HOMA_SOCKTAB_BUCKETS - 1); -} + /** @rpc: RPC that this packet belongs to. */ + void *rpc; +}; /** - * homa_server_rpc_bucket() - Find the bucket containing a given - * server RPC. - * @hsk: Socket associated with the RPC. - * @id: Id of the desired RPC. - * - * Return: The bucket in which this RPC will appear, if the RPC exists. + * homa_get_skb_info() - Return the address of Homa's private information + * for an sk_buff. + * @skb: Socket buffer whose info is needed. + * Return: address of Homa's private information for @skb. */ -static inline struct homa_rpc_bucket *homa_server_rpc_bucket( - struct homa_sock *hsk, __u64 id) +static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb) { - /* Each client allocates RPC ids sequentially, so they will - * naturally distribute themselves across the hash space. - * Thus we can use the id directly as hash. - */ - return &hsk->server_rpc_buckets[(id >> 1) - & (HOMA_SERVER_RPC_BUCKETS - 1)]; + return (struct homa_skb_info *)(skb_end_pointer(skb)) - 1; } /** * homa_set_doff() - Fills in the doff TCP header field for a Homa packet. - * @h: Packet header whose doff field is to be set. - */ -static inline void homa_set_doff(struct data_header *h) -{ - h->common.doff = (sizeof(struct data_header) - - sizeof(struct data_segment)) << 2; -} - -static inline struct homa_sock *homa_sk(const struct sock *sk) -{ - return (struct homa_sock *)sk; -} - -/** - * homa_sock_lock() - Acquire the lock for a socket. If the socket - * isn't immediately available, record stats on the waiting time. - * @hsk: Socket to lock. - * @locker: Static string identifying where the socket was locked; - * used to track down deadlocks. - */ -static inline void homa_sock_lock(struct homa_sock *hsk, char *locker) { - if (!spin_trylock_bh(&hsk->lock)) { -// printk(KERN_NOTICE "Slow path for socket %d, last locker %s", -// hsk->client_port, hsk->last_locker); - homa_sock_lock_slow(hsk); - } -// hsk->last_locker = locker; -} - -/** - * homa_sock_unlock() - Release the lock for a socket. - * @hsk: Socket to lock. - */ -static inline void homa_sock_unlock(struct homa_sock *hsk) { - spin_unlock_bh(&hsk->lock); -} - -/** - * homa_peer_lock() - Acquire the lock for a peer's @unacked_lock. If the lock - * isn't immediately available, record stats on the waiting time. - * @peer: Peer to lock. - */ -static inline void homa_peer_lock(struct homa_peer *peer) -{ - if (!spin_trylock_bh(&peer->ack_lock)) { - homa_peer_lock_slow(peer); - } -} - -/** - * homa_peer_unlock() - Release the lock for a peer's @unacked_lock. - * @peer: Peer to lock. - */ -static inline void homa_peer_unlock(struct homa_peer *peer) -{ - spin_unlock_bh(&peer->ack_lock); -} - -/** - * homa_protect_rpcs() - Ensures that no RPCs will be reaped for a given - * socket until until homa_sock_unprotect is called. Typically - * used by functions that want to scan the active RPCs for a socket - * without holding the socket lock. 
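+/* Illustrative sketch (not from the original code; active_rpcs and
+ * active_links are assumed names for the socket's RPC list): a scan
+ * under this protection might look like:
+ *
+ *	if (!homa_protect_rpcs(hsk))
+ *		return;
+ *	list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links)
+ *		homa_timer_check_rpc(rpc);
+ *	homa_unprotect_rpcs(hsk);
+ */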
Multiple calls to this function may - * be in effect at once. - * @hsk: Socket whose RPCs should be protected. Must not be locked - * by the caller; will be locked here. - * - * Return: 1 for success, 0 if the socket has been shutdown, in which - * case its RPCs cannot be protected. - */ -static inline int homa_protect_rpcs(struct homa_sock *hsk) -{ - int result; - homa_sock_lock(hsk, "homa_sock_protect"); - result = !hsk->shutdown; - if (result) - atomic_inc(&hsk->protect_count); - homa_sock_unlock(hsk); - return result; -} - -/** - * homa_unprotect_rpcs() - Cancel the effect of a previous call to - * homa_sock_protect(), so that RPCs can once again be reaped. - * @hsk: Socket whose RPCs should be unprotected. - */ -static inline void homa_unprotect_rpcs(struct homa_sock *hsk) -{ - atomic_dec(&hsk->protect_count); -} - -/** - * homa_grantable_lock() - Acquire the grantable lock. If the lock - * isn't immediately available, record stats on the waiting time. - * @homa: Overall data about the Homa protocol implementation. - */ -static inline void homa_grantable_lock(struct homa *homa) -{ - if (!spin_trylock_bh(&homa->grantable_lock)) { - homa_grantable_lock_slow(homa); - } -} - -/** - * homa_grantable_unlock() - Release the grantable lock. - * @homa: Overall data about the Homa protocol implementation. - */ -static inline void homa_grantable_unlock(struct homa *homa) -{ - spin_unlock_bh(&homa->grantable_lock); -} - -/** - * homa_throttle_lock() - Acquire the throttle lock. If the lock - * isn't immediately available, record stats on the waiting time. - * @homa: Overall data about the Homa protocol implementation. - */ -static inline void homa_throttle_lock(struct homa *homa) -{ - if (!spin_trylock_bh(&homa->throttle_lock)) { - homa_throttle_lock_slow(homa); - } -} - -/** - * homa_throttle_unlock() - Release the throttle lock. - * @homa: Overall data about the Homa protocol implementation. + * @h: Packet header whose doff field is to be set. + * @size: Size of the "header", bytes (must be a multiple of 4). This + * information is used only for TSO; it's the number of bytes + * that should be replicated in each segment. The bytes after + * this will be distributed among segments. */ -static inline void homa_throttle_unlock(struct homa *homa) +static inline void homa_set_doff(struct homa_data_hdr *h, int size) { - spin_unlock_bh(&homa->throttle_lock); + /* Drop the 2 low-order bits from size and set the 4 high-order + * bits of doff from what's left. + */ + h->common.doff = size << 2; } /** skb_is_ipv6() - Return true if the packet is encapsulated with IPv6, @@ -2878,24 +595,11 @@ static inline bool skb_is_ipv6(const struct sk_buff *skb) return ipv6_hdr(skb)->version == 6; } -/** - * Given an IPv4 address, return an equivalent IPv6 address (an IPv4-mapped - * one) - * @ip4: IPv4 address, in network byte order. - */ -static inline struct in6_addr ipv4_to_ipv6(__be32 ip4) -{ - struct in6_addr ret = {}; - if (ip4 == INADDR_ANY) return in6addr_any; - ret.in6_u.u6_addr32[2] = htonl(0xffff); - ret.in6_u.u6_addr32[3] = ip4; - return ret; -} - /** * ipv6_to_ipv4() - Given an IPv6 address produced by ipv4_to_ipv6, return * the original IPv4 address (in network byte order). * @ip6: IPv6 address; assumed to be a mapped IPv4 address. + * Return: IPv4 address stored in @ip6. 
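+ *
+ * For example (illustrative): the IPv4 address 10.0.0.1 maps to the
+ * IPv6 form ::ffff:10.0.0.1 (as produced by ipv6_addr_set_v4mapped in
+ * canonical_ipv6_addr below); this function just extracts the
+ * low-order 32 bits to recover the original IPv4 address.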
 */
static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6)
{
@@ -2903,20 +607,24 @@ static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6)
 }
 
 /**
- * skb_canonical_ipv6_addr() - Convert a socket address to the "standard"
+ * canonical_ipv6_addr() - Convert a socket address to the "standard"
  * form used in Homa, which is always an IPv6 address; if the original address
  * was IPv4, convert it to an IPv4-mapped IPv6 address.
- * @addr:   Address to canonicalize.
+ * @addr:   Address to canonicalize (if NULL, "any" is returned).
+ * Return: IPv6 address corresponding to @addr.
 */
-static inline struct in6_addr canonical_ipv6_addr(const sockaddr_in_union *addr)
+static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union
+						  *addr)
 {
+	struct in6_addr mapped;
+
 	if (addr) {
-		return (addr->sa.sa_family == AF_INET6)
-				? addr->in6.sin6_addr
-				: ipv4_to_ipv6(addr->in4.sin_addr.s_addr);
-	} else {
-		return in6addr_any;
+		if (addr->sa.sa_family == AF_INET6)
+			return addr->in6.sin6_addr;
+		ipv6_addr_set_v4mapped(addr->in4.sin_addr.s_addr, &mapped);
+		return mapped;
 	}
+	return in6addr_any;
 }
 
 /**
@@ -2925,290 +633,377 @@ static inline struct in6_addr canonical_ipv6_addr(const sockaddr_in_union *addr)
  * address; if the original address was IPv4, convert it to an IPv4-mapped
  * IPv6 address.
  * @skb:    The source address will be extracted from this packet buffer.
+ * Return: IPv6 address for @skb's source machine.
 */
 static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb)
 {
-	return skb_is_ipv6(skb) ? ipv6_hdr(skb)->saddr : ipv4_to_ipv6(
-			ip_hdr(skb)->saddr);
+	struct in6_addr mapped;
+
+	if (skb_is_ipv6(skb))
+		return ipv6_hdr(skb)->saddr;
+	ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &mapped);
+	return mapped;
 }
 
+#ifndef __STRIP__ /* See strip.py */
 /**
- * is_mapped_ipv4() - Return true if an IPv6 address is actually an
- * IPv4-mapped address, false otherwise.
- * @x:      The address to check.
+ * is_homa_pkt() - Return true if @skb is a Homa packet, false otherwise.
+ * @skb:    Packet buffer to check.
+ * Return: see above.
 */
-static inline bool is_mapped_ipv4(const struct in6_addr x)
+static inline bool is_homa_pkt(struct sk_buff *skb)
 {
-	return ((x.in6_u.u6_addr32[0] == 0) &&
-		(x.in6_u.u6_addr32[1] == 0) &&
-		(x.in6_u.u6_addr32[2] == htonl(0xffff)));
+	int protocol;
+
+	/* If the network header hasn't been created yet, assume it's a
+	 * Homa packet (Homa never generates any non-Homa packets).
+	 */
+	if (skb->network_header == 0)
+		return true;
+	protocol = (skb_is_ipv6(skb)) ? ipv6_hdr(skb)->nexthdr :
+			ip_hdr(skb)->protocol;
+	return (protocol == IPPROTO_HOMA ||
+		(protocol == IPPROTO_TCP &&
+		 tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)));
}
+#endif /* See strip.py */
 
 /**
- * tt_addr() - Given an address, return a 4-byte id that will (hopefully)
- * provide a unique identifier for the address in a timetrace record.
- * @x:      Address (either IPv6 or IPv4-mapped IPv6)
+ * homa_make_header_avl() - Invokes pskb_may_pull to make sure that all the
+ * Homa header information for a packet is in the linear part of the skb
+ * where it can be addressed using skb_transport_header.
+ * @skb:     Packet for which header is needed.
+ * Return:   The result of pskb_may_pull (true for success)
 */
-static inline __be32 tt_addr(const struct in6_addr x)
-{
-	return is_mapped_ipv4(x) ? ntohl(x.in6_u.u6_addr32[3])
-			: (x.in6_u.u6_addr32[3] ?
ntohl(x.in6_u.u6_addr32[3]) - : ntohl(x.in6_u.u6_addr32[1])); -} - -#define INC_METRIC(metric, count) \ - (homa_cores[raw_smp_processor_id()]->metrics.metric) += (count) + int pull_length; -extern struct homa_core *homa_cores[]; + pull_length = skb_transport_header(skb) - skb->data + HOMA_MAX_HEADER; + if (pull_length > skb->len) + pull_length = skb->len; + return pskb_may_pull(skb, pull_length); +} +#ifndef __UPSTREAM__ /* See strip.py */ #ifdef __UNIT_TEST__ -extern void unit_log_printf(const char *separator, const char* format, ...) - __attribute__((format(printf, 2, 3))); +void unit_log_printf(const char *separator, const char *format, ...) + __printf(2, 3); #define UNIT_LOG unit_log_printf -extern void unit_hook(char *id); +void unit_hook(char *id); #define UNIT_HOOK(msg) unit_hook(msg) -#else +#else /* __UNIT_TEST__ */ #define UNIT_LOG(...) -#define UNIT_HOOK(msg) -#endif +#define UNIT_HOOK(...) +#endif /* __UNIT_TEST__ */ +#endif /* See strip.py */ + +extern unsigned int homa_net_id; + +void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, + struct homa_rpc *rpc); +void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb); +int homa_bind(struct socket *sk, struct sockaddr *addr, + int addr_len); +void homa_close(struct sock *sock, long timeout); +int homa_copy_to_user(struct homa_rpc *rpc); +void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +void homa_destroy(struct homa *homa); +void homa_dispatch_pkts(struct sk_buff *skb); +int homa_err_handler_v4(struct sk_buff *skb, u32 info); +int homa_err_handler_v6(struct sk_buff *skb, + struct inet6_skb_parm *opt, u8 type, u8 code, + int offset, __be32 info); +int homa_fill_data_interleaved(struct homa_rpc *rpc, + struct sk_buff *skb, struct iov_iter *iter); +struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end); +int homa_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +int homa_hash(struct sock *sk); +enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); +int homa_init(struct homa *homa); +int homa_ioc_info(struct socket *sock, unsigned long arg); +int homa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +int homa_load(void); +int homa_message_out_fill(struct homa_rpc *rpc, + struct iov_iter *iter, int xmit); +void homa_message_out_init(struct homa_rpc *rpc, int length); +void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, + struct homa_rpc *rpc); +void homa_net_destroy(struct homa_net *hnet); +void homa_net_exit(struct net *net); +int homa_net_init(struct homa_net *hnet, struct net *net, + struct homa *homa); +int homa_net_start(struct net *net); +__poll_t homa_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); +int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len); +void homa_request_retrans(struct homa_rpc *rpc); +void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, + struct homa_sock *hsk); +void homa_rpc_handoff(struct homa_rpc *rpc); +int homa_rpc_tx_end(struct homa_rpc *rpc); +int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +int homa_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); +int homa_shutdown(struct socket *sock, int how); +int homa_socket(struct sock *sk); +int homa_softirq(struct sk_buff *skb); +void homa_spin(int ns); +void homa_timer(struct homa *homa); +void homa_timer_check_rpc(struct homa_rpc *rpc); +int homa_timer_main(void *transport); +struct 
sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, + struct iov_iter *iter, int offset, + int length, int max_seg_data); +void homa_unhash(struct sock *sk); +void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +void homa_unload(void); +int homa_wait_private(struct homa_rpc *rpc, int nonblocking); +struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking); +int homa_xmit_control(enum homa_packet_type type, void *contents, + size_t length, struct homa_rpc *rpc); +int __homa_xmit_control(void *contents, size_t length, + struct homa_peer *peer, struct homa_sock *hsk); +void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); + +#ifndef __STRIP__ /* See strip.py */ +void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); +int homa_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +void homa_incoming_sysctl_changed(struct homa *homa); +int homa_ioc_abort(struct socket *sock, unsigned long arg); +int homa_message_in_init(struct homa_rpc *rpc, int length, + int unsched); +void homa_prios_changed(struct homa *homa); +void homa_resend_data(struct homa_rpc *rpc, int start, int end, + int priority); +int homa_sysctl_softirq_cores(const struct ctl_table *table, + int write, void *buffer, size_t *lenp, + loff_t *ppos); +int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, + int length); +void homa_xmit_data(struct homa_rpc *rpc, bool force); +void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, + int priority); +#else /* See strip.py */ +int homa_message_in_init(struct homa_rpc *rpc, int unsched); +void homa_resend_data(struct homa_rpc *rpc, int start, int end); +void homa_xmit_data(struct homa_rpc *rpc); +void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc); +#endif /* See strip.py */ + +/** + * homa_net() - Return the struct homa_net associated with a particular + * struct net. + * @net: Get the Homa data for this net namespace. + * Return: see above. + */ +static inline struct homa_net *homa_net(struct net *net) +{ + return (struct homa_net *)net_generic(net, homa_net_id); +} + +/** + * homa_clock() - Return a fine-grain clock value that is monotonic and + * consistent across cores. + * Return: see above. + */ +static inline u64 homa_clock(void) +{ + /* This function exists to make it easy to switch time sources + * if/when new or better sources become available. + */ +#ifdef __UNIT_TEST__ + u64 mock_get_clock(void); + return mock_get_clock(); +#else /* __UNIT_TEST__ */ +#ifndef __UPSTREAM__ /* See strip.py */ + /* As of August 2025, get_cycles takes only about 8 ns/call, vs. + * 14 ns/call for ktime_get_ns. This saves about .24 core when + * driving a 25 Gbps network at high load (see perf.txt for details). + * Unfortunately, Linux reviewers will not allow get_cycles in the + * upstreamed version. + */ + return get_cycles(); +#else /* See strip.py */ + return ktime_get_ns(); +#endif /* See strip.py */ +#endif /* __UNIT_TEST__ */ +} + +/** + * homa_clock_khz() - Return the frequency of the values returned by + * homa_clock, in units of KHz. + * Return: see above. 
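+ *
+ * Usage sketch (not from the original code; 64-bit division caveats
+ * such as do_div are ignored here for brevity):
+ *
+ *	u64 start = homa_clock();
+ *	do_work();
+ *	u64 usecs = (homa_clock() - start) * 1000 / homa_clock_khz();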
+ */ +static inline u64 homa_clock_khz(void) +{ +#ifdef __UNIT_TEST__ + return 1000000; +#else /* __UNIT_TEST__ */ +#ifndef __UPSTREAM__ /* See strip.py */ + return cpu_khz; +#else /* See strip.py */ + return 1000000; +#endif /* See strip.py */ +#endif /* __UNIT_TEST__ */ +} -extern void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, - int port, int error); -extern void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); -extern void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_rpc *rpc, struct homa_lcache *lcache); -extern void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb); -extern void homa_add_to_throttled(struct homa_rpc *rpc); -extern void homa_append_metric(struct homa *homa, const char* format, ...); -extern int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb); -extern int homa_bind(struct socket *sk, struct sockaddr *addr, - int addr_len); -extern void homa_check_grantable(struct homa *homa, struct homa_rpc *rpc); -extern int homa_check_rpc(struct homa_rpc *rpc); -extern int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, - bool force); -extern void homa_close(struct sock *sock, long timeout); -extern int homa_copy_to_user(struct homa_rpc *rpc); -extern void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); -extern void homa_data_from_server(struct sk_buff *skb, - struct homa_rpc *crpc); -extern void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc, - struct homa_lcache *lcache, int *delta); -extern void homa_destroy(struct homa *homa); -extern int homa_diag_destroy(struct sock *sk, int err); -extern int homa_disconnect(struct sock *sk, int flags); -extern int homa_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -extern void homa_dst_refresh(struct homa_peertab *peertab, - struct homa_peer *peer, struct homa_sock *hsk); -extern int homa_err_handler_v4(struct sk_buff *skb, u32 info); -extern int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm * - , u8, u8, int, __be32); -extern struct homa_rpc - *homa_find_client_rpc(struct homa_sock *hsk, __u64 id); -extern struct homa_rpc - *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, __u16 sport, __u64 id); -extern void homa_free_skbs(struct sk_buff *skb); -extern void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, - char *format); -extern int homa_get_port(struct sock *sk, unsigned short snum); -extern void homa_get_resend_range(struct homa_message_in *msgin, - struct resend_header *resend); -extern int homa_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option); -extern int homa_grant_fifo(struct homa *homa); -extern void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -extern int homa_gro_complete(struct sk_buff *skb, int thoff); -extern struct sk_buff - *homa_gro_receive(struct list_head *gro_list, - struct sk_buff *skb); -extern struct sk_buff - *homa_gso_segment(struct sk_buff *skb, - netdev_features_t features); -extern int homa_hash(struct sock *sk); -extern enum hrtimer_restart - homa_hrtimer(struct hrtimer *timer); -extern int homa_init(struct homa *homa); -extern void homa_incoming_sysctl_changed(struct homa *homa); -extern int homa_ioc_abort(struct sock *sk, unsigned long arg); -extern int homa_ioctl(struct sock *sk, int cmd, unsigned long arg); -extern void homa_log_grantable_list(struct homa *homa); -extern void homa_log_throttled(struct homa *homa); -extern void 
homa_message_in_init(struct homa_message_in *msgin, int length, - int incoming); -extern int homa_message_out_init(struct homa_rpc *rpc, - struct iov_iter *iter, int xmit); -extern loff_t homa_metrics_lseek(struct file *file, loff_t offset, - int whence); -extern int homa_metrics_open(struct inode *inode, struct file *file); -extern ssize_t homa_metrics_read(struct file *file, char __user *buffer, - size_t length, loff_t *offset); -extern int homa_metrics_release(struct inode *inode, struct file *file); -extern void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_rpc *rpc); -extern int homa_offload_end(void); -extern int homa_offload_init(void); -extern void homa_outgoing_sysctl_changed(struct homa *homa); -extern int homa_pacer_main(void *transportInfo); -extern void homa_pacer_stop(struct homa *homa); -extern void homa_pacer_xmit(struct homa *homa); -extern void homa_peertab_destroy(struct homa_peertab *peertab); -extern int homa_peertab_init(struct homa_peertab *peertab); -extern void homa_peer_add_ack(struct homa_rpc *rpc); -extern struct homa_peer - *homa_peer_find(struct homa_peertab *peertab, - const struct in6_addr *addr, struct inet_sock *inet); -extern int homa_peer_get_acks(struct homa_peer *peer, int count, - struct homa_ack *dst); -extern struct dst_entry - *homa_peer_get_dst(struct homa_peer *peer, - struct inet_sock *inet); -extern void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, - int c2, int c3, int c4, int c5, int c6, int c7); -extern void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now); -extern void homa_pkt_dispatch(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_lcache *lcache, int *delta); -extern __poll_t homa_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); -extern int homa_pool_allocate(struct homa_rpc *rpc); -extern void homa_pool_destroy(struct homa_pool *pool); -extern void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, - int *available); -extern int homa_pool_get_pages(struct homa_pool *pool, int num_pages, - __u32 *pages, int leave_locked); -extern int homa_pool_init(struct homa_pool *pool, struct homa *homa, - void *buf_region, __u64 region_size); -extern void homa_pool_release_buffers(struct homa_pool *pool, - int num_buffers, __u32 *buffers); -extern char *homa_print_ipv4_addr(__be32 addr); -extern char *homa_print_ipv6_addr(const struct in6_addr *addr); -extern char *homa_print_metrics(struct homa *homa); -extern char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); -extern char *homa_print_packet_short(struct sk_buff *skb, char *buffer, - int buf_len); -extern void homa_prios_changed(struct homa *homa); -extern int homa_proc_read_metrics(char *buffer, char **start, off_t offset, - int count, int *eof, void *data); -extern int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int noblock, int flags, int *addr_len); -extern int homa_register_interests(struct homa_interest *interest, - struct homa_sock *hsk, int flags, __u64 id); -extern void homa_rehash(struct sock *sk); -extern void homa_remove_grantable_locked(struct homa *homa, - struct homa_rpc *rpc); -extern void homa_remove_from_grantable(struct homa *homa, - struct homa_rpc *rpc); -extern void homa_remove_from_throttled(struct homa_rpc *rpc); -extern void homa_resend_data(struct homa_rpc *rpc, int start, int end, - int priority); -extern void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, - struct homa_sock *hsk); -extern void homa_rpc_abort(struct 
homa_rpc *crpc, int error); -extern void homa_rpc_acked(struct homa_sock *hsk, - const struct in6_addr *saddr, struct homa_ack *ack); -extern void homa_rpc_free(struct homa_rpc *rpc); -extern void homa_rpc_free_rcu(struct rcu_head *rcu_head); -extern void homa_rpc_handoff(struct homa_rpc *rpc); -extern void homa_rpc_log(struct homa_rpc *rpc); -extern void homa_rpc_log_active(struct homa *homa, uint64_t id); -extern struct homa_rpc - *homa_rpc_new_client(struct homa_sock *hsk, - const sockaddr_in_union *dest); -extern struct homa_rpc - *homa_rpc_new_server(struct homa_sock *hsk, - const struct in6_addr *source, struct data_header *h); -extern int homa_rpc_reap(struct homa_sock *hsk, int count); -extern void homa_send_grants(struct homa *homa); -extern int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -extern int homa_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags); -extern int homa_setsockopt(struct sock *sk, int level, int optname, - sockptr_t __user optval, unsigned int optlen); -extern int homa_shutdown(struct socket *sock, int how); -extern int homa_snprintf(char *buffer, int size, int used, - const char* format, ...) - __attribute__((format(printf, 4, 5))); -extern int homa_sock_bind(struct homa_socktab *socktab, - struct homa_sock *hsk, __u16 port); -extern void homa_sock_destroy(struct homa_sock *hsk); -extern struct homa_sock * - homa_sock_find(struct homa_socktab *socktab, __u16 port); -extern void homa_sock_init(struct homa_sock *hsk, struct homa *homa); -extern void homa_sock_shutdown(struct homa_sock *hsk); -extern int homa_socket(struct sock *sk); -extern void homa_socktab_destroy(struct homa_socktab *socktab); -extern void homa_socktab_init(struct homa_socktab *socktab); -extern struct homa_sock - *homa_socktab_next(struct homa_socktab_scan *scan); -extern struct homa_sock - *homa_socktab_start_scan(struct homa_socktab *socktab, - struct homa_socktab_scan *scan); -extern int homa_softirq(struct sk_buff *skb); -extern void homa_spin(int usecs); -extern char *homa_symbol_for_state(struct homa_rpc *rpc); -extern char *homa_symbol_for_type(uint8_t type); -extern void homa_timer(struct homa *homa); -extern int homa_timer_main(void *transportInfo); -extern void homa_unhash(struct sock *sk); -extern void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -extern int homa_unsched_priority(struct homa *homa, - struct homa_peer *peer, int length); -extern int homa_v4_early_demux(struct sk_buff *skb); -extern int homa_v4_early_demux_handler(struct sk_buff *skb); -extern struct homa_rpc - *homa_wait_for_message(struct homa_sock *hsk, int flags, - __u64 id); -extern int homa_xmit_control(enum homa_packet_type type, void *contents, - size_t length, struct homa_rpc *rpc); -extern int __homa_xmit_control(void *contents, size_t length, - struct homa_peer *peer, struct homa_sock *hsk); -extern void homa_xmit_data(struct homa_rpc *rpc, bool force); -extern void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, - int priority); -extern void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); +/** + * homa_ns_to_cycles() - Convert from units of nanoseconds to units of + * homa_clock(). + * @ns: A time measurement in nanoseconds + * Return: The time in homa_clock() units corresponding to @ns. 
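+ *
+ * Worked example (illustrative): if homa_clock_khz() returns 2000000
+ * (a 2 GHz cycle counter), then 500 ns converts to
+ * 500 * 2000000 / 1000000 = 1000 clock units.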
+ */ +static inline u64 homa_ns_to_cycles(u64 ns) +{ +#ifdef __UNIT_TEST__ + return ns; +#else /* __UNIT_TEST__ */ + u64 tmp; + tmp = ns * homa_clock_khz(); + do_div(tmp, 1000000); + return tmp; +#endif /* __UNIT_TEST__ */ +} + +#ifndef __STRIP__ /* See strip.py */ /** - * homa_check_pacer() - This method is invoked at various places in Homa to - * see if the pacer needs to transmit more packets and, if so, transmit - * them. It's needed because the pacer thread may get descheduled by - * Linux, result in output stalls. - * @homa: Overall data about the Homa protocol implementation. No locks - * should be held when this function is invoked. - * @softirq: Nonzero means this code is running at softirq (bh) level; - * zero means it's running in process context. + * homa_usecs_to_cycles() - Convert from units of microseconds to units of + * homa_clock(). + * @usecs: A time measurement in microseconds + * Return: The time in homa_clock() units corresponding to @usecs. */ -static inline void homa_check_pacer(struct homa *homa, int softirq) +static inline u64 homa_usecs_to_cycles(u64 usecs) { - if (list_empty(&homa->throttled_rpcs)) - return; +#ifdef __UNIT_TEST__ + return usecs * 1000; +#else /* __UNIT_TEST__ */ + u64 tmp; - /* The "/2" in the line below gives homa_pacer_main the first chance - * to queue new packets; if the NIC queue becomes more than half - * empty, then we will help out here. - */ - if ((get_cycles() + homa->max_nic_queue_cycles/2) < - atomic64_read(&homa->link_idle_time)) - return; - tt_record("homa_check_pacer calling homa_pacer_xmit"); - homa_pacer_xmit(homa); - INC_METRIC(pacer_needed_help, 1); + tmp = usecs * homa_clock_khz(); + do_div(tmp, 1000); + return tmp; +#endif /* __UNIT_TEST__ */ } +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ /** - * homa_get_dst() - Returns destination information associated with a peer, - * updating it if the cached information is stale. - * @peer: Peer whose destination information is desired. - * @hsk: Homa socket; needed by lower-level code to recreate the dst. - * Return Up-to-date destination for peer. + * homa_high_priority() - Return the next-to-highest available priority + * level. Used in situations where we want to boost the priority of + * something but don't want to interfere with the highest priority packets + * such as control packets. + * @homa: Overall information about the Homa protocol. + * Return: See above. + * */ -static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, - struct homa_sock *hsk) +static inline int homa_high_priority(struct homa *homa) { - if (unlikely(peer->dst->obsolete > 0)) - homa_dst_refresh(&hsk->homa->peers, peer, hsk); - return peer->dst; + return (homa->num_priorities <= 2) ? 0 : homa->num_priorities - 2; } +#endif /* See strip.py */ + +/* Homa Locking Strategy: + * + * (Note: this documentation is referenced in several other places in the + * Homa code) + * + * In the Linux TCP/IP stack the primary locking mechanism is a sleep-lock + * per socket. However, per-socket locks aren't adequate for Homa, because + * sockets are "larger" in Homa. In TCP, a socket corresponds to a single + * connection between two peers; an application can have hundreds or + * thousands of sockets open at once, so per-socket locks leave lots of + * opportunities for concurrency. With Homa, a single socket can be used for + * communicating with any number of peers, so there will typically be just + * one socket per thread. 
As a result, a single Homa socket must support many
+ * concurrent RPCs efficiently, and a per-socket lock would create a bottleneck
+ * (Homa tried this approach initially).
+ *
+ * Thus, the primary locks used in Homa are spinlocks at RPC granularity. This
+ * allows operations on different RPCs for the same socket to proceed
+ * concurrently. Homa also has socket locks (which are spinlocks different
+ * from the official socket sleep-locks) but these are used much less
+ * frequently than RPC locks.
+ *
+ * Lock Ordering:
+ *
+ * There are several other locks in Homa besides RPC locks, all of which
+ * are spinlocks. When multiple locks are held, they must be acquired in a
+ * consistent order to prevent deadlock. Here are the rules for Homa:
+ * 1. Except for RPC and socket locks, all locks should be considered
+ *    "leaf" locks: don't acquire other locks while holding them.
+ * 2. The lock order is:
+ *    * RPC lock
+ *    * Socket lock
+ *    * Other lock
+ *
+ * It may seem surprising that RPC locks are acquired *before* socket locks,
+ * but this is essential for high performance. Homa has been designed so that
+ * many common operations (such as processing input packets) can be performed
+ * while holding only an RPC lock; this allows operations on different RPCs
+ * to proceed in parallel. Only a few operations, such as handing off an
+ * incoming message to a waiting thread, require the socket lock. If socket
+ * locks had to be acquired first, any operation that might eventually need
+ * the socket lock would have to acquire it before the RPC lock, which would
+ * severely restrict concurrency.
+ *
+ * Socket Shutdown:
+ *
+ * It is possible for socket shutdown to begin while operations are underway
+ * that hold RPC locks but not the socket lock. For example, a new RPC
+ * creation might be underway when a socket is shut down. The RPC creation
+ * will eventually acquire the socket lock and add the new RPC to those
+ * for the socket; it would be very bad if this were to happen after
+ * homa_sock_shutdown thinks it has deleted all RPCs for the socket.
+ * In general, any operation that acquires a socket lock must check
+ * hsk->shutdown after acquiring the lock and abort if hsk->shutdown is set.
+ *
+ * Spinlock Implications:
+ *
+ * Homa uses spinlocks exclusively; this is needed because locks typically
+ * need to be acquired at atomic level, such as in SoftIRQ code.
+ *
+ * Operations that can block, such as memory allocation and copying data
+ * to/from user space, are not permitted while holding spinlocks (spinlocks
+ * disable interrupts, so the holder must not block). This results in awkward
+ * code in several places to move restricted operations outside locked
+ * regions. Such code typically looks like this (a sketch of the pattern
+ * appears a few lines below):
+ * - Acquire a reference on an object such as an RPC, in order to prevent
+ *   the object from being deleted.
+ * - Release the object's lock.
+ * - Perform the restricted operation.
+ * - Re-acquire the lock.
+ * - Release the reference.
+ * It is possible that the object may have been modified by some other party
+ * while it was unlocked, so additional checks may be needed after reacquiring
+ * the lock. As one example, an RPC may have been terminated, in which case
+ * any operation in progress on that RPC should be aborted after reacquiring
+ * the lock.
+ *
+ * Lists of RPCs:
+ *
+ * There are a few places where Homa needs to process all of the RPCs
+ * associated with a socket, such as the timer.
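+ *
+ * A minimal sketch of that unlock/relock pattern (illustrative only;
+ * homa_rpc_hold and homa_rpc_put are assumed names for the reference
+ * helpers; homa_copy_to_user in homa_incoming.c is a real instance):
+ *
+ *	homa_rpc_hold(rpc);
+ *	homa_rpc_unlock(rpc);
+ *	err = do_restricted_op();    (e.g. copying to user space)
+ *	homa_rpc_lock(rpc);
+ *	homa_rpc_put(rpc);
+ *	if (rpc->state == RPC_DEAD)
+ *		... abort: the RPC was terminated while unlocked ...
+ *
+ * Returning to code that must process all of the RPCs for a socket: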
Such code must first lock + * the socket (to protect access to the link pointers) then lock + * individual RPCs on the list. However, this violates the rules for locking + * order. It isn't safe to unlock the socket before locking the individual RPCs, + * because RPCs could be deleted and their memory recycled between the unlock + * of the socket lock and the lock of the RPC; this could result in corruption. + * Homa uses two different approaches to handle this situation: + * 1. Use ``homa_protect_rpcs`` to prevent RPC reaping for a socket. RPCs can + * still be terminated, but their memory won't go away until + * homa_unprotect_rpcs is invoked. This allows the socket lock to be + * released before acquiring RPC locks; after acquiring each RPC lock, + * the RPC must be checked to see if it has been terminated; if so, skip it. + * 2. Use ``spin_trylock_bh`` to acquire the RPC lock while still holding the + * socket lock. If this fails, then release the socket lock and retry + * both the socket lock and the RPC lock. Of course, the state of both + * socket and RPC could change before the locks are finally acquired. + */ -extern struct completion homa_pacer_kthread_done; #endif /* _HOMA_IMPL_H */ diff --git a/homa_incoming.c b/homa_incoming.c index 6723027b..06181802 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1,51 +1,156 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ +#ifndef __STRIP__ /* See strip.py */ /* This file contains functions that handle incoming Homa messages, including - * both receiving information for those messages and sending grants. */ + * both receiving information for those messages and sending grants. + */ +#else /* See strip.py */ +/* This file contains functions that handle incoming Homa messages. */ +#endif /* See strip.py */ #include "homa_impl.h" -#include "homa_lcache.h" +#include "homa_interest.h" +#include "homa_peer.h" +#include "homa_pool.h" +#ifndef __STRIP__ /* See strip.py */ +#include "homa_grant.h" +#include "homa_offload.h" +#endif /* See strip.py */ + +#ifndef __STRIP__ /* See strip.py */ /** * homa_message_in_init() - Constructor for homa_message_in. - * @msgin: Structure to initialize. + * @rpc: RPC whose msgin structure should be initialized. * @length: Total number of bytes in message. - * @incoming: The number of unscheduled bytes the sender is planning + * @unsched: The number of unscheduled bytes the sender is planning * to transmit. + * Return: Zero for successful initialization, or a negative errno + * if rpc->msgin could not be initialized. 
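+ *
+ * Call-site sketch (hypothetical; the header fields h->message_length
+ * and h->incoming are assumptions for illustration, and msgin.length < 0
+ * is taken to mean "msgin not yet initialized", as in homa_request_retrans
+ * below):
+ *
+ *	if (rpc->msgin.length < 0) {
+ *		err = homa_message_in_init(rpc, ntohl(h->message_length),
+ *					   ntohl(h->incoming));
+ *		if (err)
+ *			goto discard;
+ *	}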
+ */
-void homa_message_in_init(struct homa_message_in *msgin, int length,
-		int incoming)
+int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched)
+#else /* See strip.py */
+/**
+ * homa_message_in_init() - Constructor for homa_message_in.
+ * @rpc:     RPC whose msgin structure should be initialized. The
+ *           msgin struct is assumed to be zeroed.
+ * @length:  Total number of bytes in message.
+ * Return:   Zero for successful initialization, or a negative errno
+ *           if rpc->msgin could not be initialized.
+ */
+int homa_message_in_init(struct homa_rpc *rpc, int length)
+#endif /* See strip.py */
+	__must_hold(rpc->bucket->lock)
 {
-	msgin->total_length = length;
-	skb_queue_head_init(&msgin->packets);
-	msgin->num_skbs = 0;
-	msgin->bytes_remaining = length;
-	msgin->incoming = (incoming > length) ? length : incoming;
-	msgin->priority = 0;
-	msgin->scheduled = length > incoming;
-	if (length < HOMA_NUM_SMALL_COUNTS*64) {
-		INC_METRIC(small_msg_bytes[(length-1) >> 6], length);
-	} else if (length < HOMA_NUM_MEDIUM_COUNTS*1024) {
-		INC_METRIC(medium_msg_bytes[(length-1) >> 10], length);
+	int err;
+
+	if (length > HOMA_MAX_MESSAGE_LENGTH)
+		return -EINVAL;
+
+	rpc->msgin.length = length;
+	__skb_queue_head_init(&rpc->msgin.packets);
+	INIT_LIST_HEAD(&rpc->msgin.gaps);
+	rpc->msgin.bytes_remaining = length;
+	IF_NO_STRIP(rpc->msgin.birth = homa_clock());
+	err = homa_pool_alloc_msg(rpc);
+	if (err != 0) {
+		rpc->msgin.length = -1;
+		return err;
+	}
+#ifndef __STRIP__ /* See strip.py */
+	homa_grant_init_rpc(rpc, unsched);
+	if (length < HOMA_NUM_SMALL_COUNTS * 64) {
+		INC_METRIC(small_msg_bytes[(length - 1) >> 6], length);
+	} else if (length < HOMA_NUM_MEDIUM_COUNTS * 1024) {
+		INC_METRIC(medium_msg_bytes[(length - 1) >> 10], length);
 	} else {
 		INC_METRIC(large_msg_count, 1);
 		INC_METRIC(large_msg_bytes, length);
 	}
-	msgin->copied_out = 0;
-	msgin->num_bpages = 0;
+	if (homa_is_client(rpc->id)) {
+		INC_METRIC(client_responses_started, 1);
+		INC_METRIC(client_response_bytes_started, length);
+	} else {
+		INC_METRIC(server_requests_started, 1);
+		INC_METRIC(server_request_bytes_started, length);
+	}
+#endif /* See strip.py */
+	return 0;
+}
+
+/**
+ * homa_gap_alloc() - Allocate a new gap and add it to a gap list.
+ * @next:   Add the new gap just before this list element.
+ * @start:  Offset of first byte covered by the gap.
+ * @end:    Offset of byte just after the last one covered by the gap.
+ * Return:  Pointer to the new gap, or NULL if memory couldn't be allocated
+ *          for the gap object.
+ */
+struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end)
+{
+	struct homa_gap *gap;
+
+	gap = kmalloc(sizeof(*gap), GFP_ATOMIC);
+	if (!gap)
+		return NULL;
+	gap->start = start;
+	gap->end = end;
+	gap->time = homa_clock();
+	list_add_tail(&gap->links, next);
+	return gap;
+}
+
+/**
+ * homa_request_retrans() - This function is invoked when it appears that
+ * data packets for a message have been lost. It issues RESEND requests
+ * as appropriate and may modify the state of the RPC.
+ * @rpc:     RPC for which incoming data is delinquent; must be locked by
+ *           caller.
+ */
+void homa_request_retrans(struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_resend_hdr resend;
+	struct homa_gap *gap;
+	int offset, length;
+
+#ifndef __STRIP__ /* See strip.py */
+	resend.priority = rpc->hsk->homa->num_priorities - 1;
+#endif /* See strip.py */
+
+	if (rpc->msgin.length >= 0) {
+		/* Issue RESENDS for any gaps in incoming data.
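+		 * As a worked example (illustrative numbers): for a
+		 * 10000-byte message where bytes 0-999 and 3000-3999
+		 * have arrived, msgin.gaps holds one gap with
+		 * start == 1000 and end == 3000 and recv_end == 4000,
+		 * so this loop sends a single RESEND with offset 1000
+		 * and length 2000; the code after the loop covers
+		 * anything expected beyond recv_end.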
+		 */
+		list_for_each_entry(gap, &rpc->msgin.gaps, links) {
+			resend.offset = htonl(gap->start);
+			resend.length = htonl(gap->end - gap->start);
+			tt_record4("Sending RESEND for id %d, peer 0x%x, offset %d, length %d",
+				   rpc->id, tt_addr(rpc->peer->addr),
+				   gap->start, gap->end - gap->start);
+			homa_xmit_control(RESEND, &resend, sizeof(resend), rpc);
+		}
+
+		/* Issue a RESEND for any granted data after the last gap. */
+		offset = rpc->msgin.recv_end;
+#ifndef __STRIP__ /* See strip.py */
+		length = rpc->msgin.granted - rpc->msgin.recv_end;
+#else /* See strip.py */
+		length = rpc->msgin.length - rpc->msgin.recv_end;
+#endif /* See strip.py */
+		if (length <= 0)
+			return;
+	} else {
+		/* No data has been received for the RPC. Ask the sender to
+		 * resend everything it has sent so far.
+		 */
+		offset = 0;
+		length = -1;
+	}
+
+	resend.offset = htonl(offset);
+	resend.length = htonl(length);
+	tt_record4("Sending RESEND for id %d, peer 0x%x, offset %d, length %d",
+		   rpc->id, tt_addr(rpc->peer->addr), offset, length);
+	homa_xmit_control(RESEND, &resend, sizeof(resend), rpc);
+}
 
 /**
@@ -53,64 +158,129 @@ void homa_message_in_init(struct homa_message_in *msgin, int length,
  * partially received message.
  * @rpc:   Add the packet to the msgin for this RPC.
  * @skb:   The new packet. This function takes ownership of the packet
- *         and will free it, if it doesn't get added to msgin (because
- *         it provides no new data).
+ *         (the packet will either be freed or added to rpc->msgin.packets).
 */
 void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
+	__must_hold(rpc->bucket->lock)
 {
-	struct data_header *h = (struct data_header *) skb->data;
-	int offset = ntohl(h->seg.offset);
-	int data_bytes = ntohl(h->seg.segment_length);
-	struct sk_buff *skb2;
-
-	/* Any data from the packet with offset less than this is
-	 * of no value.*/
-	int floor = rpc->msgin.copied_out;
-
-	/* Any data with offset >= this is useless. */
-	int ceiling = rpc->msgin.total_length;
-
-	/* Figure out where in the list of existing packets to insert the
-	 * new one. It doesn't necessarily go at the end, but it almost
-	 * always will in practice, so work backwards from the end of the
-	 * list.
-	 */
-	skb_queue_reverse_walk(&rpc->msgin.packets, skb2) {
-		struct data_header *h2 = (struct data_header *) skb2->data;
-		int offset2 = ntohl(h2->seg.offset);
-		int data_bytes2 = ntohl(h2->seg.segment_length);
-		if (offset2 < offset) {
-			floor = offset2 + data_bytes2;
-			break;
+	struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data;
+	struct homa_gap *gap, *dummy, *gap2;
+	int start = ntohl(h->seg.offset);
+	int length = homa_data_len(skb);
+	enum skb_drop_reason reason;
+	int end = start + length;
+
+	if ((start + length) > rpc->msgin.length) {
+		tt_record3("Packet extended past message end; id %d, offset %d, length %d",
+			   rpc->id, start, length);
+		reason = SKB_DROP_REASON_PKT_TOO_BIG;
+		goto discard;
+	}
+
+	if (start == rpc->msgin.recv_end) {
+		/* Common case: packet is sequential. */
+		rpc->msgin.recv_end += length;
+		goto keep;
+	}
+
+	if (start > rpc->msgin.recv_end) {
+		/* Packet creates a new gap. */
+		if (!homa_gap_alloc(&rpc->msgin.gaps,
+				    rpc->msgin.recv_end, start)) {
+			tt_record2("Couldn't allocate gap for id %d (start %d): no memory",
+				   rpc->id, start);
+			reason = SKB_DROP_REASON_NOMEM;
+			goto discard;
 		}
-		ceiling = offset2;
+		rpc->msgin.recv_end = end;
+		goto keep;
 	}
 
-	/* New packet goes right after skb2.  If this packet overlaps either
-	 * of its neighbors, then it is discarded (partial overlaps are
-	 * not permitted).
+	/* Must now check to see if the packet fills in part or all of
+	 * an existing gap.
 	 */
-	if ((offset < floor) || ((offset + data_bytes) > ceiling)) {
-		/* This packet is redundant. */
-//		char buffer[100];
-//		printk(KERN_NOTICE "redundant Homa packet: %s\n",
-//				homa_print_packet(skb, buffer, sizeof(buffer)));
-		INC_METRIC(redundant_packets, 1);
-		tt_record4("homa_add_packet discarding packet for id %d, "
-				"offset %d, copied_out %d, remaining %d",
-				rpc->id, offset, rpc->msgin.copied_out,
-				rpc->msgin.total_length);
-		kfree_skb(skb);
-		return;
+	list_for_each_entry_safe(gap, dummy, &rpc->msgin.gaps, links) {
+		/* Is packet at the start of this gap? */
+		if (start <= gap->start) {
+			if (end <= gap->start)
+				continue;
+			if (start < gap->start) {
+				tt_record4("Packet overlaps gap start: id %d, start %d, end %d, gap_start %d",
+					   rpc->id, start, end, gap->start);
+				reason = SKB_DROP_REASON_DUP_FRAG;
+				goto discard;
+			}
+			if (end > gap->end) {
+				tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d",
+					   rpc->id, start, end, gap->end);
+				reason = SKB_DROP_REASON_DUP_FRAG;
+				goto discard;
+			}
+			gap->start = end;
+			if (gap->start >= gap->end) {
+				list_del(&gap->links);
+				kfree(gap);
+			}
+			goto keep;
+		}
+
+		/* Is packet at the end of this gap? BTW, at this point we know
+		 * the packet can't cover the entire gap.
+		 */
+		if (end >= gap->end) {
+			if (start >= gap->end)
+				continue;
+			if (end > gap->end) {
+				tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d",
+					   rpc->id, start, end, gap->end);
+				reason = SKB_DROP_REASON_DUP_FRAG;
+				goto discard;
+			}
+			gap->end = start;
+			goto keep;
+		}
+
+		/* Packet is in the middle of the gap; must split the gap. */
+		gap2 = homa_gap_alloc(&gap->links, gap->start, start);
+		if (!gap2) {
+			tt_record2("Couldn't allocate gap for split for id %d (start %d): no memory",
+				   rpc->id, start);
+			reason = SKB_DROP_REASON_NOMEM;
+			goto discard;
+		}
+		gap2->time = gap->time;
+		gap->start = end;
+		goto keep;
 	}
-	if (h->retransmit) {
+
+	/* Fell out of the loop: the packet is entirely redundant (all of
+	 * its data had already been received), so "reason" must be set
+	 * here before falling into the discard path.
+	 */
+	reason = SKB_DROP_REASON_DUP_FRAG;
+
+discard:
+#ifndef __STRIP__ /* See strip.py */
+	if (h->retransmit)
+		INC_METRIC(resent_discards, 1);
+	else
+		INC_METRIC(packet_discards, 1);
+#endif /* See strip.py */
+	tt_record4("homa_add_packet discarding packet for id %d, offset %d, length %d, retransmit %d",
+		   rpc->id, start, length, h->retransmit);
+	kfree_skb_reason(skb, reason);
+	return;
+
+keep:
+	__skb_queue_tail(&rpc->msgin.packets, skb);
+	rpc->msgin.bytes_remaining -= length;
+#ifndef __STRIP__ /* See strip.py */
+	if (h->retransmit)
 		INC_METRIC(resent_packets_used, 1);
-		homa_freeze(rpc, PACKET_LOST, "Freezing because of lost "
-				"packet, id %d, peer 0x%x");
+	if (homa_is_client(rpc->id)) {
+		INC_METRIC(client_response_bytes_done, length);
+		INC_METRIC(client_responses_done,
+			   rpc->msgin.bytes_remaining == 0);
+	} else {
+		INC_METRIC(server_request_bytes_done, length);
+		INC_METRIC(server_requests_done,
+			   rpc->msgin.bytes_remaining == 0);
 	}
-	__skb_insert(skb, skb2, skb2->next, &rpc->msgin.packets);
-	rpc->msgin.bytes_remaining -= data_bytes;
-	rpc->msgin.num_skbs++;
+#endif /* See strip.py */
 }
 
 /**
@@ -118,370 +288,357 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
  * packet buffers to buffers in user space.
  * @rpc:     RPC for which data should be copied. Must be locked by caller.
  * Return:   Zero for success or a negative errno if there is an error.
+ * It is possible for the RPC to be freed while this function + * executes (it releases and reacquires the RPC lock). If that + * happens, -EINVAL will be returned and the state of @rpc + * will be RPC_DEAD. Clears the RPC_PKTS_READY bit in @rpc->flags + * if all available packets have been copied out. */ int homa_copy_to_user(struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { #ifdef __UNIT_TEST__ #define MAX_SKBS 3 -#else -#define MAX_SKBS 10 -#endif +#else /* __UNIT_TEST__ */ +#define MAX_SKBS 20 +#endif /* __UNIT_TEST__ */ struct sk_buff *skbs[MAX_SKBS]; - int n = 0; /* Number of filled entries in skbs. */ +#ifndef __UPSTREAM__ /* See strip.py */ + int start_offset = 0; + int end_offset = 0; +#endif /* See strip.py */ int error = 0; - int count; - - /* Number of bytes that have already been copied to user space - * from the current packet. - */ - int copied_from_seg; + int n = 0; /* Number of filled entries in skbs. */ +#ifndef __STRIP__ /* See strip.py */ + u64 start; +#endif /* See strip.py */ + int i; /* Tricky note: we can't hold the RPC lock while we're actually * copying to user space, because (a) it's illegal to hold a spinlock * while copying to user space and (b) we'd like for homa_softirq * to add more packets to the RPC while we're copying these out. - * So, collect a bunch of chunks to copy, then release the lock, + * So, collect a bunch of packets to copy, then release the lock, * copy them, and reacquire the lock. */ while (true) { - struct sk_buff *skb = skb_peek(&rpc->msgin.packets); - struct data_header *h; - int i, seg_offset; - - if (!skb || (rpc->msgin.copied_out >= rpc->msgin.total_length)) - goto copy_out; - h = (struct data_header *) skb->data; - seg_offset = ntohl(h->seg.offset); - if (rpc->msgin.copied_out < seg_offset) { - /* The next data to copy hasn't yet been received; - * wait for more packets to arrive. - */ - goto copy_out; + struct sk_buff *skb; + + if (rpc->state == RPC_DEAD) { + error = -EINVAL; + break; } - BUG_ON(rpc->msgin.copied_out != seg_offset); - skbs[n] = skb; - n++; - skb_dequeue(&rpc->msgin.packets); - rpc->msgin.num_skbs--; - rpc->msgin.copied_out = seg_offset + ntohl(h->seg.segment_length); - if (n < MAX_SKBS) - continue; - -copy_out: - if (n == 0) + + skb = __skb_dequeue(&rpc->msgin.packets); + if (skb) { + skbs[n] = skb; + n++; + if (n < MAX_SKBS) + continue; + } + if (n == 0) { + clear_bit(RPC_PKTS_READY, &rpc->flags); break; - atomic_or(RPC_COPYING_TO_USER, &rpc->flags); + } + + /* At this point we've collected a batch of packets (or + * run out of packets); copy any available packets out to + * user space. + */ homa_rpc_unlock(rpc); tt_record1("starting copy to user space for id %d", - rpc->id); - - /* Each iteration of this loop copies (part of?) an skb - * to a contiguous range of buffer space. - */ - count = 0; - copied_from_seg = 0; - for (i = 0; i < n && !error; ) { - int skb_bytes, buf_bytes, next_copied; - char *dst; - struct iovec iov; + rpc->id); + + /* Each iteration of this loop copies out one skb. 
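The "tricky note" at the top of this function is a general pattern: drain a producer-consumer queue in bounded batches, dropping the lock for the expensive per-item work so producers are never stalled behind the copy. A minimal pthread rendering of the pattern (illustrative names, not kernel code):

#include <pthread.h>

#define MAX_BATCH 20

struct item { struct item *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;                 /* Protected by lock. */

static struct item *pop(void)
{
	struct item *it = head;

	if (it)
		head = it->next;
	return it;
}

static void process(struct item *it) { (void)it; /* expensive work */ }

void drain(void)
{
	struct item *batch[MAX_BATCH];
	int i, n;

	pthread_mutex_lock(&lock);
	while (1) {
		for (n = 0; n < MAX_BATCH; n++) {
			batch[n] = pop();
			if (!batch[n])
				break;
		}
		if (n == 0)
			break;                /* Queue fully drained. */
		pthread_mutex_unlock(&lock);  /* Expensive work unlocked. */
		for (i = 0; i < n; i++)
			process(batch[i]);
		pthread_mutex_lock(&lock);    /* Re-check under the lock. */
	}
	pthread_mutex_unlock(&lock);
}

The batch bound (MAX_SKBS above, MAX_BATCH here) keeps the stack footprint fixed and bounds the time between lock re-acquisitions.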
*/ + for (i = 0; i < n; i++) { + struct homa_data_hdr *h = (struct homa_data_hdr *) + skbs[i]->data; + int pkt_length = homa_data_len(skbs[i]); + int offset = ntohl(h->seg.offset); + int buf_bytes, chunk_size; struct iov_iter iter; + int copied = 0; + char __user *dst; - skb = skbs[i]; - h = (struct data_header *) skb->data; - skb_bytes = ntohl(h->seg.segment_length) - copied_from_seg; - dst = homa_pool_get_buffer(rpc, - ntohl(h->seg.offset) + copied_from_seg, - &buf_bytes); - if (dst == NULL) { - error = -ENOMEM; - break; - } - if (buf_bytes < skb_bytes) { - if (buf_bytes == 0) { - /* skb seems to have data beyond the - * end of the message. - */ - break; + /* Each iteration of this loop copies to one + * user buffer. + */ + while (copied < pkt_length) { + chunk_size = pkt_length - copied; + dst = homa_pool_get_buffer(rpc, offset + copied, + &buf_bytes); + if (buf_bytes < chunk_size) { + if (buf_bytes == 0) { + /* skb has data beyond message + * end? + */ + break; + } + chunk_size = buf_bytes; } - skb_bytes = buf_bytes; - next_copied = copied_from_seg + skb_bytes; - } else { - i++; - next_copied = 0; + error = import_ubuf(READ, dst, chunk_size, + &iter); + if (error) + goto free_skbs; + error = skb_copy_datagram_iter(skbs[i], + sizeof(*h) + + copied, &iter, + chunk_size); + if (error) + goto free_skbs; + copied += chunk_size; } - BUG_ON(skb_bytes <= 0); - error = import_single_range(READ, dst, skb_bytes, &iov, - &iter); - if (error) - break; - error = skb_copy_datagram_iter(skb, - sizeof(*h) + copied_from_seg, &iter, - skb_bytes); - copied_from_seg = next_copied; - count += skb_bytes; +#ifndef __UPSTREAM__ /* See strip.py */ + if (end_offset == 0) { + start_offset = offset; + } else if (end_offset != offset) { + tt_record3("copied out bytes %d-%d for id %d", + start_offset, end_offset, rpc->id); + start_offset = offset; + } + end_offset = offset + pkt_length; +#endif /* See strip.py */ } - tt_record3("finished copying %d bytes for id %d, copied_out %d", - count, rpc->id, ntohl(h->seg.offset) - + ntohl(h->seg.segment_length)); - /* Free skbs. */ +free_skbs: +#ifndef __UPSTREAM__ /* See strip.py */ + if (end_offset != 0) { + tt_record3("copied out bytes %d-%d for id %d", + start_offset, end_offset, rpc->id); + end_offset = 0; + } +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ + start = homa_clock(); +#endif /* See strip.py */ for (i = 0; i < n; i++) - kfree_skb(skbs[i]); + consume_skb(skbs[i]); + INC_METRIC(skb_free_cycles, homa_clock() - start); + INC_METRIC(skb_frees, n); tt_record2("finished freeing %d skbs for id %d", - n, rpc->id); + n, rpc->id); n = 0; - homa_rpc_lock(rpc); - atomic_andnot(RPC_COPYING_TO_USER, &rpc->flags); + homa_rpc_lock_preempt(rpc); if (error) break; } +#ifndef __STRIP__ /* See strip.py */ if (error) tt_record2("homa_copy_to_user returning error %d for id %d", - -error, rpc->id); + -error, rpc->id); +#endif /* See strip.py */ return error; } /** - * homa_get_resend_range() - Given a message for which some input data - * is missing, find the first range of missing data. - * @msgin: Message for which not all granted data has been received. - * @resend: The @offset and @length fields of this structure will be - * filled in with information about the first missing range - * in @msgin. + * homa_dispatch_pkts() - Top-level function that processes a batch of packets, + * all related to the same RPC. + * @skb: First packet in the batch, linked through skb->next. 
*/ -void homa_get_resend_range(struct homa_message_in *msgin, - struct resend_header *resend) +void homa_dispatch_pkts(struct sk_buff *skb) { - struct sk_buff *skb; - int missing_bytes; - /* This will eventually be the top of the first missing range. */ - int end_offset; - - if (msgin->total_length < 0) { - /* Haven't received any data for this message; request - * retransmission of just the first packet (the sender - * will send at least one full packet, regardless of - * the length below). - */ - resend->offset = 0; - resend->length = htonl(100); - return; - } - - end_offset = msgin->incoming; - - /* The code below handles the case where we've received data past - * msgin->incoming. In this case, end_offset should start off at - * the offset just after the last byte received. - */ - skb = skb_peek_tail(&msgin->packets); - if (skb) { - struct data_header *h = (struct data_header *) skb->data; - int data_end = ntohl(h->seg.offset) - + ntohl(h->seg.segment_length); - if (data_end > end_offset) - end_offset = data_end; - } - - missing_bytes = msgin->bytes_remaining - - (msgin->total_length - end_offset); - if (missing_bytes == 0) { - resend->offset = 0; - resend->length = 0; - return; - } - - /* Basic idea: walk backwards through the message's packets until - * we have accounted for all missing bytes; this will identify - * the first missing range. - */ - skb_queue_reverse_walk(&msgin->packets, skb) { - struct data_header *h = (struct data_header *) skb->data; - int offset = ntohl(h->seg.offset); - int pkt_length = ntohl(h->seg.segment_length); - int gap; - - if (pkt_length > (end_offset - offset)) - pkt_length = end_offset - offset; - gap = end_offset - (offset + pkt_length); - missing_bytes -= gap; - if (missing_bytes == 0) { - resend->offset = htonl(offset + pkt_length); - resend->length = htonl(gap); - return; +#ifdef __UNIT_TEST__ +#define MAX_ACKS 2 +#else /* __UNIT_TEST__ */ +#define MAX_ACKS 10 +#endif /* __UNIT_TEST__ */ + const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; + u64 id = homa_local_id(h->common.sender_id); + int dport = ntohs(h->common.dport); + struct homa_rpc *rpc = NULL; + struct homa_sock *hsk; + struct homa_net *hnet; + struct sk_buff *next; + + /* Find the appropriate socket.*/ + hnet = homa_net(dev_net(skb->dev)); + hsk = homa_sock_find(hnet, dport); + if (!hsk || (!homa_is_client(id) && !hsk->is_server)) { + if (skb_is_ipv6(skb)) + icmp6_send(skb, ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, 0, NULL, IP6CB(skb)); + else + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + tt_record3("Discarding packet(s) for unknown port %u, id %llu, type %d", + dport, homa_local_id(h->common.sender_id), + h->common.type); + while (skb) { + next = skb->next; + kfree_skb(skb); + skb = next; } - end_offset = offset; + if (hsk) + sock_put(&hsk->sock); + return; } - /* The first packet(s) are missing. */ - tt_record4("first packets missing, missing_bytes %d, copied_out %d, " - "incoming %d, length %d", - missing_bytes, msgin->copied_out, msgin->incoming, - msgin->total_length); - resend->offset = htonl(msgin->copied_out); - resend->length = htonl(missing_bytes); -} - -/** - * homa_pkt_dispatch() - Top-level function for handling an incoming packet. - * @skb: The incoming packet. This function takes ownership of the - * packet and will ensure that it is eventually freed. - * @hsk: Homa socket that owns the packet's destination port. 
This socket - * is not locked, but its existence is ensured for the life - * of this method. - * @lcache: Used to manage RPC locks; must be properly initialized by - * the caller, may be modified here. - * @delta: Pointer to a value that will be incremented or decremented - * to accumulate changes that need to be made to - * homa->total_incoming. - * - * Return: None. - */ -void homa_pkt_dispatch(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_lcache *lcache, int *delta) -{ - struct common_header *h = (struct common_header *) skb->data; - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct homa_rpc *rpc; - __u64 id = homa_local_id(h->sender_id); + /* Each iteration through the following loop processes one packet. */ + for (; skb; skb = next) { + h = (struct homa_data_hdr *)skb->data; + next = skb->next; - /* If there is an ack in the packet, handle it. Must do this - * before locking the packet's RPC, since we may need to acquire - * (other) RPC locks to handle the acks. - */ - if (h->type == DATA) { - struct data_header *dh = (struct data_header *) h; - if (dh->seg.ack.client_id != 0) { - /* homa_rpc_acked may attempt to lock the RPC, so - * make sure we don't have an RPC locked. - */ - homa_lcache_release(lcache); - homa_rpc_acked(hsk, &saddr, &dh->seg.ack); + /* Relinquish the RPC lock temporarily if it's needed + * elsewhere. + */ + if (rpc) { + if (test_bit(APP_NEEDS_LOCK, &rpc->flags)) { + homa_rpc_unlock(rpc); + tt_record2("softirq released lock for id %d, flags 0x%x", + rpc->id, rpc->flags); + + /* This short spin is needed to ensure that the + * other thread gets the lock before this thread + * grabs it again below (the need for this + * was confirmed experimentally in 2/2025; + * without it, the handoff fails 20-25% of the + * time). Furthermore, the call to homa_spin + * seems to allow the other thread to acquire + * the lock more quickly. + */ + homa_spin(100); + homa_rpc_lock(rpc); + } } - } - /* Find and lock the RPC for this packet. */ - rpc = homa_lcache_get(lcache, id, &saddr, ntohs(h->sport)); - if (!rpc) { - /* To avoid deadlock, must release old RPC before locking new. */ - homa_lcache_release(lcache); - if (!homa_is_client(id)) { - /* We are the server for this RPC. */ - if (h->type == DATA) { - /* Create a new RPC if one doesn't already exist. */ - rpc = homa_rpc_new_server(hsk, &saddr, - (struct data_header *) h); - if (IS_ERR(rpc)) { - printk(KERN_WARNING "homa_pkt_dispatch " - "couldn't create " - "server rpc: error %lu", - -PTR_ERR(rpc)); - INC_METRIC(server_cant_create_rpcs, 1); - rpc = NULL; - goto discard; + /* If we don't already have an RPC, find it, lock it, + * and create a reference on it. + */ + if (!rpc) { + if (!homa_is_client(id)) { + /* We are the server for this RPC. */ + if (h->common.type == DATA) { + int created; + + /* Create a new RPC if one doesn't + * already exist. 
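The APP_NEEDS_LOCK dance above is easy to get wrong: simply unlocking and relocking rarely lets the waiter win the race, which is why the code inserts a short spin between the release and the re-acquire. A compact sketch of the same idea using a flag plus a brief pause (pthreads and C11 atomics; names are illustrative, not the module's API):

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int needs_lock;            /* Raised by an impatient waiter. */

static void brief_spin(void)
{
	/* Stand-in for the short spin described above. */
	for (volatile int i = 0; i < 100; i++)
		;
}

/* Called by a long-running thread that holds @lock. */
void holder_step(void)
{
	if (atomic_load(&needs_lock)) {
		pthread_mutex_unlock(&lock);
		brief_spin();            /* Let the waiter in first. */
		pthread_mutex_lock(&lock);
	}
	/* ...continue batch work under the lock... */
}

void waiter(void)
{
	atomic_store(&needs_lock, 1);
	pthread_mutex_lock(&lock);
	atomic_store(&needs_lock, 0);
	/* ...short critical section... */
	pthread_mutex_unlock(&lock);
}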
+ */ + rpc = homa_rpc_alloc_server(hsk, &saddr, + h, + &created); + if (IS_ERR(rpc)) { + INC_METRIC(server_cant_create_rpcs, 1); + rpc = NULL; + goto discard; + } + } else { + rpc = homa_rpc_find_server(hsk, &saddr, + id); } - } else - rpc = homa_find_server_rpc(hsk, &saddr, - ntohs(h->sport), id); - - } else { - rpc = homa_find_client_rpc(hsk, id); - } - if (rpc) - homa_lcache_save(lcache, rpc); - } - if (unlikely(!rpc)) { - if ((h->type != CUTOFFS) && (h->type != NEED_ACK) - && (h->type != ACK) && (h->type != RESEND)) { - tt_record4("Discarding packet for unknown RPC, id %u, " - "type %d, peer 0x%x:%d", - id, h->type, - tt_addr(saddr), - ntohs(h->sport)); - if ((h->type != GRANT) || homa_is_client(id)) - INC_METRIC(unknown_rpcs, 1); - goto discard; + } else { + rpc = homa_rpc_find_client(hsk, id); + } + if (rpc) + homa_rpc_hold(rpc); } - } else { - if ((h->type == DATA) || (h->type == GRANT) - || (h->type == BUSY)) - rpc->silent_ticks = 0; - rpc->peer->outstanding_resends = 0; - if (hsk->homa->sync_freeze) { - hsk->homa->sync_freeze = 0; - if (!tt_frozen) { - struct freeze_header freeze; - tt_record2("Freezing timetrace because of " - "sync_freeze, id %d, peer 0x%x", - rpc->id, - tt_addr(rpc->peer->addr)); - tt_freeze(); - homa_xmit_control(FREEZE, &freeze, - sizeof(freeze), rpc); + if (unlikely(!rpc)) { +#ifndef __STRIP__ /* See strip.py */ + if (h->common.type != CUTOFFS && + h->common.type != NEED_ACK && +#else /* See strip.py */ + if (h->common.type != NEED_ACK && +#endif /* See strip.py */ + h->common.type != ACK && + h->common.type != RESEND) { + tt_record4("Discarding packet for unknown RPC, id %u, type %d, peer 0x%x:%d", + id, h->common.type, tt_addr(saddr), + ntohs(h->common.sport)); +#ifndef __STRIP__ /* See strip.py */ + if (h->common.type != GRANT || + homa_is_client(id)) + INC_METRIC(unknown_rpcs, 1); +#endif /* See strip.py */ + goto discard; } + } else { + if (h->common.type == DATA || +#ifndef __STRIP__ /* See strip.py */ + h->common.type == GRANT || +#endif /* See strip.py */ + h->common.type == BUSY) + rpc->silent_ticks = 0; + rpc->peer->outstanding_resends = 0; } - } - switch (h->type) { - case DATA: - homa_data_pkt(skb, rpc, lcache, delta); - INC_METRIC(packets_received[DATA - DATA], 1); - if (hsk->dead_skbs >= 2*hsk->homa->dead_buffs_limit) { - /* We get here if neither homa_wait_for_message - * nor homa_timer can keep up with reaping dead - * RPCs. See reap.txt for details. + switch (h->common.type) { + case DATA: + homa_data_pkt(skb, rpc); + INC_METRIC(packets_received[DATA - DATA], 1); + break; +#ifndef __STRIP__ /* See strip.py */ + case GRANT: + INC_METRIC(packets_received[GRANT - DATA], 1); + homa_grant_pkt(skb, rpc); + break; +#endif /* See strip.py */ + case RESEND: + INC_METRIC(packets_received[RESEND - DATA], 1); + homa_resend_pkt(skb, rpc, hsk); + break; + case RPC_UNKNOWN: + INC_METRIC(packets_received[RPC_UNKNOWN - DATA], 1); + homa_rpc_unknown_pkt(skb, rpc); + break; + case BUSY: + INC_METRIC(packets_received[BUSY - DATA], 1); + tt_record2("received BUSY for id %d, peer 0x%x", + id, tt_addr(rpc->peer->addr)); + /* Nothing to do for these packets except reset + * silent_ticks, which happened above. */ - uint64_t start = get_cycles(); - - /* Must unlock to avoid self-deadlock in rpc_reap. 
*/ - homa_lcache_release(lcache); - rpc = NULL; - tt_record("homa_data_pkt calling homa_rpc_reap"); - homa_rpc_reap(hsk, hsk->homa->reap_limit); - INC_METRIC(data_pkt_reap_cycles, get_cycles() - start); + goto discard; +#ifndef __STRIP__ /* See strip.py */ + case CUTOFFS: + INC_METRIC(packets_received[CUTOFFS - DATA], 1); + homa_cutoffs_pkt(skb, hsk); + break; +#endif /* See strip.py */ + case NEED_ACK: + INC_METRIC(packets_received[NEED_ACK - DATA], 1); + homa_need_ack_pkt(skb, hsk, rpc); + break; + case ACK: + INC_METRIC(packets_received[ACK - DATA], 1); + homa_ack_pkt(skb, hsk, rpc); + break; + default: + INC_METRIC(unknown_packet_types, 1); + goto discard; } - break; - case GRANT: - INC_METRIC(packets_received[GRANT - DATA], 1); - homa_grant_pkt(skb, rpc); - break; - case RESEND: - INC_METRIC(packets_received[RESEND - DATA], 1); - homa_resend_pkt(skb, rpc, hsk); - break; - case UNKNOWN: - INC_METRIC(packets_received[UNKNOWN - DATA], 1); - homa_unknown_pkt(skb, rpc); - break; - case BUSY: - INC_METRIC(packets_received[BUSY - DATA], 1); - tt_record2("received BUSY for id %d, peer 0x%x", - id, tt_addr(rpc->peer->addr)); - /* Nothing to do for these packets except reset silent_ticks, - * which happened above. - */ - goto discard; - case CUTOFFS: - INC_METRIC(packets_received[CUTOFFS - DATA], 1); - homa_cutoffs_pkt(skb, hsk); - break; - case NEED_ACK: - INC_METRIC(packets_received[NEED_ACK - DATA], 1); - homa_need_ack_pkt(skb, hsk, rpc); - break; - case ACK: - INC_METRIC(packets_received[ACK - DATA], 1); - homa_ack_pkt(skb, hsk, rpc, lcache); - break; - default: - INC_METRIC(unknown_packet_types, 1); - goto discard; + continue; + +discard: + kfree_skb(skb); + } + if (rpc) { + IF_NO_STRIP(homa_grant_check_rpc(rpc)); + homa_rpc_put(rpc); + homa_rpc_unlock(rpc); } - return; - discard: - kfree_skb(skb); + /* We need to reap dead RPCs here under two conditions: + * 1. The socket has hit its limit on tx buffer space and threads are + * blocked waiting for skbs to be released. + * 2. A large number of dead RPCs have accumulated, and it seems + * that the reaper isn't keeping up when invoked only at + * "convenient" times (see "RPC Reaping Strategy" in homa_rpc_reap + * code for details). + */ + if (hsk->dead_skbs > 0) { + int waiting_for_wmem = test_bit(SOCK_NOSPACE, + &hsk->sock.sk_socket->flags); + if (waiting_for_wmem || + hsk->dead_skbs >= 2 * hsk->homa->dead_buffs_limit) { + IF_NO_STRIP(u64 start = homa_clock()); + + tt_record("homa_dispatch_pkts calling homa_rpc_reap"); + homa_rpc_reap(hsk, waiting_for_wmem); + INC_METRIC(data_pkt_reap_cycles, homa_clock() - start); + } + } + sock_put(&hsk->sock); } /** @@ -489,61 +646,82 @@ void homa_pkt_dispatch(struct sk_buff *skb, struct homa_sock *hsk, * @skb: Incoming packet; size known to be large enough for the header. * This function now owns the packet. * @rpc: Information about the RPC corresponding to this packet. - * @lcache: @rpc must be stored here; released if needed to unlock @rpc. - * @delta: Pointer to a value that will be incremented or decremented - * to accumulate changes that need to be made to homa->total_incoming. - * - * Return: Zero means the function completed successfully. Nonzero means - * that the RPC had to be unlocked and deleted because the socket has been - * shut down; the caller should not access the RPC anymore. + * Must be locked by the caller. 
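The reaping policy at the end of homa_dispatch_pkts can be read as a two-trigger rule, sketched below with hypothetical field names: reap urgently when senders are blocked waiting for tx buffer memory, or when the backlog of dead buffers has grown past twice the normal limit.

/* Illustrative condensation of the two reap triggers described above. */
struct sock_state {
	int dead_skbs;        /* Buffers still held by dead RPCs. */
	int dead_buffs_limit; /* Background reaping keeps us near this. */
	int nospace;          /* Senders blocked waiting for tx memory? */
};

static int should_reap(const struct sock_state *s)
{
	if (s->dead_skbs == 0)
		return 0;
	/* Reap if blocked senders need memory back right away, or if
	 * opportunistic reaping has fallen far behind.
	 */
	return s->nospace || s->dead_skbs >= 2 * s->dead_buffs_limit;
}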
*/ -void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc, - struct homa_lcache *lcache, int *delta) +void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { + struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; +#ifndef __STRIP__ /* See strip.py */ struct homa *homa = rpc->hsk->homa; - struct data_header *h = (struct data_header *) skb->data; - int old_remaining; +#endif /* See strip.py */ tt_record4("incoming data packet, id %d, peer 0x%x, offset %d/%d", - homa_local_id(h->common.sender_id), - tt_addr(rpc->peer->addr), ntohl(h->seg.offset), - ntohl(h->message_length)); + homa_local_id(h->common.sender_id), + tt_addr(rpc->peer->addr), ntohl(h->seg.offset), + ntohl(h->message_length)); - if (rpc->state != RPC_INCOMING) { - if (homa_is_client(rpc->id)) { - if (unlikely(rpc->state != RPC_OUTGOING)) - goto discard; - INC_METRIC(responses_received, 1); - rpc->state = RPC_INCOMING; - } else { - if (unlikely(rpc->msgin.total_length >= 0)) - goto discard; - } + if (h->ack.client_id) { + const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + + homa_rpc_unlock(rpc); + homa_rpc_acked(rpc->hsk, &saddr, &h->ack); + homa_rpc_lock(rpc); + if (rpc->state == RPC_DEAD) + goto discard; } - if (rpc->msgin.total_length < 0) { - /* First data packet for message; initialize. */ + if (rpc->state != RPC_INCOMING && homa_is_client(rpc->id)) { + if (unlikely(rpc->state != RPC_OUTGOING)) + goto discard; + INC_METRIC(responses_received, 1); + rpc->state = RPC_INCOMING; +#ifndef __STRIP__ /* See strip.py */ tt_record2("Incoming message for id %d has %d unscheduled bytes", - rpc->id, ntohl(h->incoming)); - homa_message_in_init(&rpc->msgin, ntohl(h->message_length), - ntohl(h->incoming)); - *delta += rpc->msgin.incoming; + rpc->id, ntohl(h->incoming)); +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ + if (homa_message_in_init(rpc, ntohl(h->message_length), + ntohl(h->incoming)) != 0) +#else /* See strip.py */ + if (homa_message_in_init(rpc, ntohl(h->message_length)) != 0) +#endif /* See strip.py */ + goto discard; + } else if (rpc->state != RPC_INCOMING) { + /* Must be server; note that homa_rpc_alloc_server already + * initialized msgin and allocated buffers. + */ + if (unlikely(rpc->msgin.length >= 0)) + goto discard; + } + + if (rpc->msgin.num_bpages == 0) { + /* Drop packets that arrive when we can't allocate buffer + * space. If we keep them around, packet buffer usage can + * exceed available cache space, resulting in poor + * performance. 
+ */ +#ifndef __STRIP__ /* See strip.py */ + tt_record4("Dropping packet because no buffer space available: id %d, offset %d, length %d, old incoming %d", + rpc->id, ntohl(h->seg.offset), homa_data_len(skb), + rpc->msgin.granted); +#else /* See strip.py */ + tt_record3("Dropping packet because no buffer space available: id %d, offset %d, length %d", + rpc->id, ntohl(h->seg.offset), homa_data_len(skb)); +#endif /* See strip.py */ + INC_METRIC(dropped_data_no_bufs, homa_data_len(skb)); + goto discard; } - old_remaining = rpc->msgin.bytes_remaining; homa_add_packet(rpc, skb); - *delta -= old_remaining - rpc->msgin.bytes_remaining; - if ((ntohl(h->seg.offset) == rpc->msgin.copied_out) - && !(atomic_read(&rpc->flags) & RPC_PKTS_READY)) { - atomic_or(RPC_PKTS_READY, &rpc->flags); - homa_sock_lock(rpc->hsk, "homa_data_pkt"); + if (skb_queue_len(&rpc->msgin.packets) != 0 && + !test_bit(RPC_PKTS_READY, &rpc->flags)) { + set_bit(RPC_PKTS_READY, &rpc->flags); homa_rpc_handoff(rpc); - homa_sock_unlock(rpc->hsk); } - if (rpc->msgin.scheduled) - homa_check_grantable(homa, rpc); +#ifndef __STRIP__ /* See strip.py */ if (ntohs(h->cutoff_version) != homa->cutoff_version) { /* The sender has out-of-date cutoffs. Note: we may need * to resend CUTOFFS packets if one gets lost, but we don't @@ -554,7 +732,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc, * packet. */ if (jiffies != rpc->peer->last_update_jiffies) { - struct cutoffs_header h2; + struct homa_cutoffs_hdr h2; int i; for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { @@ -566,28 +744,32 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc, rpc->peer->last_update_jiffies = jiffies; } } +#endif /* See strip.py */ return; - discard: +discard: kfree_skb(skb); + UNIT_LOG("; ", "homa_data_pkt discarded packet"); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_grant_pkt() - Handler for incoming GRANT packets * @skb: Incoming packet; size already verified large enough for header. * This function now owns the packet. * @rpc: Information about the RPC corresponding to this packet. + * Must be locked by caller. */ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { - struct grant_header *h = (struct grant_header *) skb->data; + struct homa_grant_hdr *h = (struct homa_grant_hdr *)skb->data; + int new_offset = ntohl(h->offset); - tt_record3("processing grant for id %llu, offset %d, priority %d", - homa_local_id(h->common.sender_id), ntohl(h->offset), - h->priority); + tt_record4("processing grant for id %llu, offset %d, priority %d, increment %d", + homa_local_id(h->common.sender_id), ntohl(h->offset), + h->priority, new_offset - rpc->msgout.granted); if (rpc->state == RPC_OUTGOING) { - int new_offset = ntohl(h->offset); - if (new_offset > rpc->msgout.granted) { rpc->msgout.granted = new_offset; if (new_offset > rpc->msgout.length) @@ -596,8 +778,9 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) rpc->msgout.sched_priority = h->priority; homa_xmit_data(rpc, false); } - kfree_skb(skb); + consume_skb(skb); } +#endif /* See strip.py */ /** * homa_resend_pkt() - Handler for incoming RESEND packets @@ -609,110 +792,140 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) * @hsk: Socket on which the packet was received. 
*/ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, - struct homa_sock *hsk) + struct homa_sock *hsk) + __must_hold(rpc->bucket->lock) { - struct resend_header *h = (struct resend_header *) skb->data; - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct busy_header busy; - - if (rpc == NULL) { - tt_record4("resend request for unknown id %d, peer 0x%x:%d, " - "offset %d; responding with UNKNOWN", - homa_local_id(h->common.sender_id), - tt_addr(saddr), ntohs(h->common.sport), - ntohl(h->offset)); + struct homa_resend_hdr *h = (struct homa_resend_hdr *)skb->data; + int offset = ntohl(h->offset); + int length = ntohl(h->length); + int end = offset + length; + struct homa_busy_hdr busy; + int tx_end; + + if (!rpc) { + tt_record4("resend request for unknown id %d, peer 0x%x:%d, offset %d; responding with RPC_UNKNOWN", + homa_local_id(h->common.sender_id), + tt_addr(skb_canonical_ipv6_saddr(skb)), + ntohs(h->common.sport), ntohl(h->offset)); homa_xmit_unknown(skb, hsk); goto done; } +#ifndef __STRIP__ /* See strip.py */ tt_record4("resend request for id %llu, offset %d, length %d, prio %d", - rpc->id, ntohl(h->offset), ntohl(h->length), - h->priority); - - if (!homa_is_client(rpc->id)) { - /* We are the server for this RPC. */ - if (rpc->state != RPC_OUTGOING) { - tt_record2("sending BUSY from resend, id %d, state %d", - rpc->id, rpc->state); - homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); - goto done; - } + rpc->id, offset, length, h->priority); +#else /* See strip.py */ + tt_record3("resend request for id %llu, offset %d, length %d", + rpc->id, offset, length); +#endif /* See strip.py */ + + tx_end = homa_rpc_tx_end(rpc); + if (!homa_is_client(rpc->id) && rpc->state != RPC_OUTGOING) { + /* We are the server for this RPC and don't yet have a + * response message, so send BUSY to keep the client + * waiting. + */ + tt_record2("sending BUSY from resend, id %d, state %d", + rpc->id, rpc->state); + homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); + goto done; } - if (rpc->msgout.next_xmit_offset < rpc->msgout.granted) { - /* We have chosen not to transmit data from this message; - * send BUSY instead. + + if (length == -1) + end = tx_end; + +#ifndef __STRIP__ /* See strip.py */ + homa_resend_data(rpc, offset, (end > tx_end) ? tx_end : end, + h->priority); + + if (end > rpc->msgout.granted) { + /* It appears that a grant packet was lost; assume that + * any data requested in the RESEND must have been + * granted previously. */ - tt_record3("sending BUSY from resend, id %d, offset %d, " - "granted %d", rpc->id, - rpc->msgout.next_xmit_offset, - rpc->msgout.granted); + rpc->msgout.granted = end; + if (rpc->msgout.granted > rpc->msgout.length) + rpc->msgout.granted = rpc->msgout.length; + homa_xmit_data(rpc, false); + } +#else /* See strip.py */ + homa_resend_data(rpc, offset, (end > tx_end) ? tx_end : end); +#endif /* See strip.py */ + + if (offset >= tx_end) { + /* We have chosen not to transmit any of the requested data; + * send BUSY so the receiver knows we are alive. + */ + tt_record3("sending BUSY from resend, id %d, offset %d, tx_end %d", + rpc->id, offset, tx_end); homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); - } else { - if (ntohl(h->length) == 0) { - /* This RESEND is from a server just trying to determine - * whether the client still cares about the RPC; return - * BUSY so the server doesn't time us out. 
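Condensing the RESEND handling above into a decision function makes the cases easier to see. This sketch uses hypothetical names and omits the grant-repair step that the unstripped build performs; it returns what should be sent back to the peer:

enum resend_action { SEND_BUSY, SEND_DATA };

static enum resend_action handle_resend(int offset, int length,
					int tx_end, int is_server,
					int have_response)
{
	int end = (length == -1) ? tx_end : offset + length;

	if (is_server && !have_response)
		return SEND_BUSY;     /* Nothing to resend yet; stay alive. */
	if (end > tx_end)
		end = tx_end;         /* Never resend beyond what was sent. */
	if (offset >= tx_end)
		return SEND_BUSY;     /* Range is all unsent; just say alive. */
	return SEND_DATA;             /* Retransmit [offset, end). */
}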
- */ - homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); - } - homa_resend_data(rpc, ntohl(h->offset), - ntohl(h->offset) + ntohl(h->length), - h->priority); + goto done; } - done: - kfree_skb(skb); +done: + consume_skb(skb); } /** - * homa_unknown_pkt() - Handler for incoming UNKNOWN packets. + * homa_rpc_unknown_pkt() - Handler for incoming RPC_UNKNOWN packets. * @skb: Incoming packet; size known to be large enough for the header. * This function now owns the packet. - * @rpc: Information about the RPC corresponding to this packet. + * @rpc: Information about the RPC corresponding to this packet. Must + * be locked by caller. */ -void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) +void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { tt_record3("Received unknown for id %llu, peer %x:%d", - rpc->id, tt_addr(rpc->peer->addr), rpc->dport); + rpc->id, tt_addr(rpc->peer->addr), rpc->dport); if (homa_is_client(rpc->id)) { if (rpc->state == RPC_OUTGOING) { + int tx_end = homa_rpc_tx_end(rpc); + /* It appears that everything we've already transmitted * has been lost; retransmit it. */ - tt_record4("Restarting id %d to server 0x%x:%d, " - "lost %d bytes", - rpc->id, tt_addr(rpc->peer->addr), - rpc->dport, - rpc->msgout.next_xmit_offset); - homa_freeze(rpc, RESTART_RPC, "Freezing because of " - "RPC restart, id %d, peer 0x%x"); - homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset, - homa_unsched_priority(rpc->hsk->homa, - rpc->peer, rpc->msgout.length)); + tt_record4("Restarting id %d to server 0x%x:%d, lost %d bytes", + rpc->id, tt_addr(rpc->peer->addr), + rpc->dport, tx_end); +#ifndef __STRIP__ /* See strip.py */ + homa_freeze(rpc, RESTART_RPC, + "Freezing because of RPC restart, id %d, peer 0x%x"); + homa_resend_data(rpc, 0, tx_end, + homa_unsched_priority(rpc->hsk->homa, + rpc->peer, + rpc->msgout.length)); +#else /* See strip.py */ + homa_resend_data(rpc, 0, tx_end); +#endif /* See strip.py */ goto done; } - - printk(KERN_ERR "Received unknown for RPC id %llu, peer %s:%d " - "in bogus state %d; discarding unknown\n", - rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), - rpc->dport, rpc->state); - tt_record4("Discarding unknown for RPC id %d, peer 0x%x:%d: " - "bad state %d", - rpc->id, tt_addr(rpc->peer->addr), rpc->dport, - rpc->state); +#ifndef __STRIP__ /* See strip.py */ + pr_err("Received unknown for RPC id %llu, peer %s:%d in bogus state %d; discarding unknown\n", + rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), + rpc->dport, rpc->state); +#endif /* See strip.py */ + tt_record4("Discarding unknown for RPC id %d, peer 0x%x:%d: bad state %d", + rpc->id, tt_addr(rpc->peer->addr), rpc->dport, + rpc->state); +#ifndef __STRIP__ /* See strip.py */ } else { if (rpc->hsk->homa->verbose) - printk(KERN_NOTICE "Freeing rpc id %llu from client " - "%s:%d: unknown to client", - rpc->id, - homa_print_ipv6_addr(&rpc->peer->addr), - rpc->dport); - homa_rpc_free(rpc); + pr_notice("Ending rpc id %llu from client %s:%d: unknown to client", + rpc->id, + homa_print_ipv6_addr(&rpc->peer->addr), + rpc->dport); + homa_rpc_end(rpc); INC_METRIC(server_rpcs_unknown, 1); +#else /* See strip.py */ + } else { + homa_rpc_end(rpc); +#endif /* See strip.py */ } done: - kfree_skb(skb); + consume_skb(skb); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_cutoffs_pkt() - Handler for incoming CUTOFFS packets * @skb: Incoming packet; size already verified large enough for header. 
@@ -721,20 +934,22 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc)
  */
 void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk)
 {
+	struct homa_cutoffs_hdr *h = (struct homa_cutoffs_hdr *)skb->data;
 	const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
+	struct homa_peer *peer;
 	int i;
-	struct cutoffs_header *h = (struct cutoffs_header *) skb->data;
-	struct homa_peer *peer = homa_peer_find(&hsk->homa->peers,
-			&saddr, &hsk->inet);
 
+	peer = homa_peer_get(hsk, &saddr);
 	if (!IS_ERR(peer)) {
 		peer->unsched_cutoffs[0] = INT_MAX;
 		for (i = 1; i < HOMA_MAX_PRIORITIES; i++)
 			peer->unsched_cutoffs[i] = ntohl(h->unsched_cutoffs[i]);
 		peer->cutoff_version = h->cutoff_version;
+		homa_peer_release(peer);
 	}
-	kfree_skb(skb);
+	consume_skb(skb);
 }
+#endif /* See strip.py */
 
 /**
  * homa_need_ack_pkt() - Handler for incoming NEED_ACK packets
@@ -745,25 +960,29 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk)
  * RPC exists. The RPC has been locked by the caller.
  */
 void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
-		struct homa_rpc *rpc)
+		       struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
 {
-	struct common_header *h = (struct common_header *) skb->data;
+	struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data;
 	const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
-	__u64 id = homa_local_id(h->sender_id);
-	struct ack_header ack;
+	u64 id = homa_local_id(h->sender_id);
+	struct homa_ack_hdr ack;
 	struct homa_peer *peer;
 
 	tt_record1("Received NEED_ACK for id %d", id);
 
-	/* Return if it's not safe for the peer to purge its state
+	/* Don't ack if it's not safe for the peer to purge its state
	 * for this RPC (the RPC still exists and we haven't received
	 * the entire response), or if we can't find peer info.
	 */
-	if ((rpc != NULL) && ((rpc->state != RPC_INCOMING)
-			|| rpc->msgin.bytes_remaining)) {
+	if (rpc && (rpc->state != RPC_INCOMING ||
+		    rpc->msgin.bytes_remaining)) {
+		tt_record3("NEED_ACK arrived for id %d before message received, state %d, remaining %d",
+			   rpc->id, rpc->state, rpc->msgin.bytes_remaining);
+		homa_request_retrans(rpc);
 		goto done;
 	} else {
-		peer = homa_peer_find(&hsk->homa->peers, &saddr, &hsk->inet);
+		peer = homa_peer_get(hsk, &saddr);
 		if (IS_ERR(peer))
 			goto done;
 	}
@@ -775,15 +994,18 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 	ack.common.type = ACK;
 	ack.common.sport = h->dport;
 	ack.common.dport = h->sport;
+	IF_NO_STRIP(homa_set_hijack(&ack.common));
 	ack.common.sender_id = cpu_to_be64(id);
 	ack.num_acks = htons(homa_peer_get_acks(peer,
-			NUM_PEER_UNACKED_IDS, ack.acks));
+						HOMA_MAX_ACKS_PER_PKT,
+						ack.acks));
 	__homa_xmit_control(&ack, sizeof(ack), peer, hsk);
-	tt_record3("Responded to NEED_ACK for id %d, peer %0x%x with %d "
-			"other acks", id, tt_addr(saddr), ntohs(ack.num_acks));
+	tt_record3("Responded to NEED_ACK for id %d, peer 0x%x with %d other acks",
+		   id, tt_addr(saddr), ntohs(ack.num_acks));
+	homa_peer_release(peer);
 
- done:
-	kfree_skb(skb);
+done:
+	consume_skb(skb);
 }
 
 /**
@@ -792,1018 +1014,313 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
  * This function now owns the packet.
  * @hsk:    Socket on which the packet was received.
  * @rpc:    The RPC named in the packet header, or NULL if no such
- *          RPC exists. The RPC has been locked by the caller and
- *          recorded in @lcache.
- * @lcache: Will be released here to unlock the RPC.
+ *          RPC exists. If not NULL, the RPC will be dead on return
+ *          (its lock is still held by the caller).
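The reply built above piggybacks up to HOMA_MAX_ACKS_PER_PKT additional acks from the peer's backlog onto a single ACK packet, amortizing the cost of explicit acknowledgment. The sketch below shows the drain operation in isolation; struct peer, ack_rpc, and get_acks are illustrative stand-ins for the module's peer bookkeeping, not its actual API.

#include <string.h>

#define MAX_ACKS_PER_PKT 10
#define MAX_UNACKED 64

struct peer {
	unsigned long long unacked[MAX_UNACKED];
	int num_unacked;
};

/* Record a completed RPC whose ack has not yet been sent. */
static void ack_rpc(struct peer *p, unsigned long long id)
{
	if (p->num_unacked < MAX_UNACKED)
		p->unacked[p->num_unacked++] = id;
}

/* Move up to @limit pending acks from @p into @dst; returns the count.
 * An outgoing packet calls this to piggyback whatever acks fit.
 */
static int get_acks(struct peer *p, int limit, unsigned long long *dst)
{
	int n = p->num_unacked < limit ? p->num_unacked : limit;

	memcpy(dst, &p->unacked[p->num_unacked - n], n * sizeof(*dst));
	p->num_unacked -= n;
	return n;
}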
*/ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_rpc *rpc, struct homa_lcache *lcache) + struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { - struct ack_header *h = (struct ack_header *) skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + struct homa_ack_hdr *h = (struct homa_ack_hdr *)skb->data; int i, count; - if (rpc != NULL) { - homa_rpc_free(rpc); - homa_lcache_release(lcache); + if (rpc) { + tt_record1("homa_ack_pkt freeing rpc id %d", rpc->id); + homa_rpc_end(rpc); } count = ntohs(h->num_acks); - for (i = 0; i < count; i++) - homa_rpc_acked(hsk, &saddr, &h->acks[i]); - tt_record3("ACK received for id %d, peer 0x%x, with %d other acks", - homa_local_id(h->common.sender_id), - tt_addr(saddr), count); - kfree_skb(skb); -} - -/** - * homa_check_grantable() - This function ensures that an RPC is on a - * grantable list if appropriate, and not on one otherwise. It also adjusts - * the position of the RPC upward on its list, if needed. - * @homa: Overall data about the Homa protocol implementation. - * @rpc: RPC to check; typically the status of this RPC has changed - * in a way that may affect its grantability (e.g. a packet - * just arrived for it). Must be locked. - */ -void homa_check_grantable(struct homa *homa, struct homa_rpc *rpc) -{ - struct homa_rpc *candidate; - struct homa_peer *peer = rpc->peer; - struct homa_peer *peer_cand; - struct homa_message_in *msgin = &rpc->msgin; - - /* No need to do anything unless this message is ready for more - * grants. - */ - if (((rpc->msgin.incoming - (rpc->msgin.total_length - - rpc->msgin.bytes_remaining)) >= homa->rtt_bytes) - || (rpc->msgin.incoming >= rpc->msgin.total_length)) - return; - - homa_grantable_lock(homa); - /* Note: must check incoming again: it might have changed. */ - if ((rpc->state == RPC_DEAD) || (rpc->msgin.incoming - >= rpc->msgin.total_length)) { - homa_grantable_unlock(homa); - return; - } - - /* Make sure this message is in the right place in the grantable_rpcs - * list for its peer. - */ - if (list_empty(&rpc->grantable_links)) { - /* Message not yet tracked; add it in priority order to - * the peer's list. - */ - rpc->msgin.birth = get_cycles(); - list_for_each_entry(candidate, &peer->grantable_rpcs, - grantable_links) { - if (candidate->msgin.bytes_remaining - > msgin->bytes_remaining) { - list_add_tail(&rpc->grantable_links, - &candidate->grantable_links); - goto position_peer; - } - } - list_add_tail(&rpc->grantable_links, &peer->grantable_rpcs); - } else while (rpc != list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links)) { - /* Message is on the list, but its priority may have - * increased because of the recent packet arrival. If so, - * adjust its position in the list. - */ - candidate = list_prev_entry(rpc, grantable_links); - /* Fewer remaining bytes wins: */ - if (candidate->msgin.bytes_remaining < msgin->bytes_remaining) - goto position_peer; - /* Tie-breaker: oldest wins */ - if (candidate->msgin.bytes_remaining == msgin->bytes_remaining) { - if (candidate->msgin.birth <= msgin->birth) { - goto position_peer; - } - } - __list_del_entry(&candidate->grantable_links); - list_add(&candidate->grantable_links, &rpc->grantable_links); - } - - position_peer: - /* At this point rpc is positioned correctly on the list for its peer. - * However, the peer may need to be added to, or moved upward on, - * homa->grantable_peers. - */ - if (list_empty(&peer->grantable_links)) { - /* Must add peer to the overall Homa list. 
*/ - homa->num_grantable_peers++; - list_for_each_entry(peer_cand, &homa->grantable_peers, - grantable_links) { - candidate = list_first_entry(&peer_cand->grantable_rpcs, - struct homa_rpc, grantable_links); - if ((candidate->msgin.bytes_remaining - > msgin->bytes_remaining) - || ((candidate->msgin.bytes_remaining - == msgin->bytes_remaining) - && (candidate->msgin.birth - > msgin->birth))) { - list_add_tail(&peer->grantable_links, - &peer_cand->grantable_links); - goto done; - } + if (count > 0) { + if (rpc) { + /* Must temporarily release rpc's lock because + * homa_rpc_acked needs to acquire RPC locks. + */ + homa_rpc_unlock(rpc); + for (i = 0; i < count; i++) + homa_rpc_acked(hsk, &saddr, &h->acks[i]); + homa_rpc_lock(rpc); + } else { + for (i = 0; i < count; i++) + homa_rpc_acked(hsk, &saddr, &h->acks[i]); } - list_add_tail(&peer->grantable_links, &homa->grantable_peers); - goto done; } - /* The peer is on Homa's list, but it may need to move upward. */ - while (peer != list_first_entry(&homa->grantable_peers, - struct homa_peer, grantable_links)) { - struct homa_peer *prev_peer = list_prev_entry( - peer, grantable_links); - candidate = list_first_entry(&prev_peer->grantable_rpcs, - struct homa_rpc, grantable_links); - if ((candidate->msgin.bytes_remaining < msgin->bytes_remaining) - || ((candidate->msgin.bytes_remaining - == msgin->bytes_remaining) - && (candidate->msgin.birth <= msgin->birth))) - goto done; - __list_del_entry(&prev_peer->grantable_links); - list_add(&prev_peer->grantable_links, &peer->grantable_links); - } - - done: - homa_grantable_unlock(homa); + tt_record3("ACK received for id %d, peer 0x%x, with %d other acks", + homa_local_id(h->common.sender_id), tt_addr(saddr), count); + consume_skb(skb); } /** - * homa_send_grants() - This function checks to see whether it is - * appropriate to send grants and, if so, it sends them. - * @homa: Overall data about the Homa protocol implementation. + * homa_wait_private() - Waits until the response has been received for + * a specific RPC or the RPC has failed with an error. + * @rpc: RPC to wait for; an error will be returned if the RPC is + * not a client RPC or not private. Must be locked by caller. + * @nonblocking: Nonzero means return immediately if @rpc not ready. + * Return: 0 means that @rpc is ready for attention: either its response + * has been received or it has an unrecoverable error such as + * ETIMEDOUT (in rpc->error). Nonzero means some other error + * (such as EINTR or EINVAL) occurred before @rpc became ready + * for attention; in this case the return value is a negative + * errno. */ -void homa_send_grants(struct homa *homa) +int homa_wait_private(struct homa_rpc *rpc, int nonblocking) + __must_hold(rpc->bucket->lock) { - /* Some overall design notes: - * - Grant to multiple messages, as long as we can keep - * homa->total_incoming under homa->max_incoming bytes. - * - Ideally, each message should use a different priority level, - * determined by bytes_remaining (fewest bytes_remaining gets the - * highest priority). If there aren't enough scheduled priority - * levels for all of the messages, then the lowest level gets - * shared by multiple messages. - * - If there are fewer messages than priority levels, then we use - * the lowest available levels (new higher-priority messages can - * use the higher levels to achieve instantaneous preemption). - * - We only grant to one message for a given host (there's no - * point in granting to multiple, since the host will only send - * the highest priority one). 
- */ - struct homa_rpc *candidate; - struct homa_peer *peer, *temp; - int rank, i, window; - __u64 start; - - /* The variables below keep track of grants we need to send; - * don't send any until the very end, and release the lock - * first. - */ -#ifdef __UNIT_TEST__ - extern int mock_max_grants; -#define MAX_GRANTS mock_max_grants -#else -#define MAX_GRANTS 10 -#endif - struct grant_header grants[MAX_GRANTS]; - struct homa_rpc *rpcs[MAX_GRANTS]; - int num_grants = 0; - - /* How many more bytes we can grant before hitting the limit. */ - int available = homa->max_incoming - atomic_read(&homa->total_incoming); - - /* Total bytes in additional grants that we've given out so far. */ - int granted_bytes = 0; - - /* Make a local copy of homa->grantable_peers, since that variable - * could change during this function. - */ - int num_grantable_peers = homa->num_grantable_peers; - if ((num_grantable_peers == 0) || (available <= 0)) { - return; - } - - /* Compute the window (how much granted-but-not-received data there - * can be for each message. This will always be at least rtt_bytes, - * but if there aren't enough messages to consume all of - * max_incoming, then increase the window size to use it up - * (except, keep rtt_bytes in reserve so we can fully grant - * a new high-priority message). - */ - if (homa->max_grant_window == 0) { - window = homa->rtt_bytes; - } else { - /* Experimental: compute the window (how much granted-but-not- - * received data there can be for any given message. This will - * always be at least rtt_bytes, but if there aren't enough - * messages to consume all of max_incoming, then increase - * the window size to use it up (except, keep rtt_bytes in - * reserve so we can fully grant a new high-priority message). - * This technique is risky because it could use up almost - * all the grants on a single non-responsive host, which - * could result in underutilization of our downlink if that - * host stops responding. - */ - window = (homa->max_incoming - - homa->rtt_bytes)/num_grantable_peers; - if (window > homa->max_grant_window) - window = homa->max_grant_window; - if (window < homa->rtt_bytes) - window = homa->rtt_bytes; - } - - start = get_cycles(); - homa_grantable_lock(homa); - - /* Figure out which messages should receive additional grants. Consider - * only a single (highest-priority) entry for each peer. + struct homa_interest interest; +#ifndef __STRIP__ /* See strip.py */ + int avail_immediately = 1; + int blocked = 0; +#endif /* See strip.py */ + int result; + + if (!test_bit(RPC_PRIVATE, &rpc->flags)) + return -EINVAL; + + /* Each iteration through this loop waits until rpc needs attention + * in some way (e.g. packets have arrived), then deals with that need + * (e.g. copy to user space). It may take many iterations until the + * RPC is ready for the application. */ - rank = 0; - list_for_each_entry_safe(peer, temp, &homa->grantable_peers, - grantable_links) { - int extra_levels, priority; - int received, new_grant, increment; - struct grant_header *grant; - - rank++; - candidate = list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links); - - /* Tricky synchronization issue: homa_data_pkt may be - * updating bytes_remaining while we're working here. - * So, we only read it once, right now, and we only - * make updates to total_incoming based on changes - * to msgin.incoming (not bytes_remaining). homa_data_pkt - * will update total_incoming based on bytes_remaining - * but not incoming. 
- */ - received = (candidate->msgin.total_length - - candidate->msgin.bytes_remaining); - new_grant = received + window; - if (new_grant > candidate->msgin.total_length) - new_grant = candidate->msgin.total_length; - increment = new_grant - candidate->msgin.incoming; - tt_record3("grant info: id %d, received %d, incoming %d", - candidate->id, received, - candidate->msgin.incoming); - if (increment <= 0) - continue; - if (available <= 0) + while (1) { + result = 0; + if (!rpc->error) + rpc->error = homa_copy_to_user(rpc); + if (rpc->error) { + IF_NO_STRIP(avail_immediately = 0); break; - if (increment > available) { - increment = available; - new_grant = candidate->msgin.incoming + increment; } - - /* The following line is needed to prevent spurious resends. - * Without it, if the timer fires right after we send the - * grant, it might think the RPC is slow and request a - * resend (until we send the grant, timeouts won't occur - * because there's no granted data). - */ - candidate->silent_ticks = 0; - - /* Create a grant for this message. */ - candidate->msgin.incoming = new_grant; - granted_bytes += increment; - available -= increment; - homa->grant_nonfifo_left -= increment; - atomic_inc(&candidate->grants_in_progress); - rpcs[num_grants] = candidate; - grant = &grants[num_grants]; - num_grants++; - grant->offset = htonl(new_grant); - priority = homa->max_sched_prio - (rank - 1); - extra_levels = homa->max_sched_prio + 1 - num_grantable_peers; - if (extra_levels >= 0) - priority -= extra_levels; - if (priority < 0) - priority = 0; - grant->priority = priority; - tt_record4("sending grant for id %llu, offset %d, priority %d, " - "increment %d", - candidate->id, new_grant, priority, increment); - if (new_grant == candidate->msgin.total_length) - homa_remove_grantable_locked(homa, candidate); - if (num_grants == MAX_GRANTS) + if (rpc->msgin.length >= 0 && + rpc->msgin.bytes_remaining == 0 && + skb_queue_len(&rpc->msgin.packets) == 0) { + tt_record2("homa_wait_private found rpc id %d, pid %d via null, blocked 0", + rpc->id, current->pid); break; - } - - if (homa->grant_nonfifo_left <= 0) { - homa->grant_nonfifo_left += homa->grant_nonfifo; - if ((num_grantable_peers > homa->max_overcommit) - && homa->grant_fifo_fraction) - granted_bytes += homa_grant_fifo(homa); - } - - atomic_add(granted_bytes, &homa->total_incoming); - homa_grantable_unlock(homa); - - /* By sending grants without holding grantable_lock here, we reduce - * contention on that lock significantly. This only works because - * rpc->grants_in_progress keeps the RPC from being deleted out from - * under us. - */ - for (i = 0; i < num_grants; i++) { - /* Send any accumulated grants (ignore errors). */ - BUG_ON(rpcs[i]->magic != HOMA_RPC_MAGIC); - homa_xmit_control(GRANT, &grants[i], sizeof(grants[i]), - rpcs[i]); - atomic_dec(&rpcs[i]->grants_in_progress); - } - INC_METRIC(grant_cycles, get_cycles() - start); -} - -/** - * homa_grant_fifo() - This function is invoked occasionally to give - * a high-priority grant to the oldest incoming message. We do this in - * order to reduce the starvation that SRPT can cause for long messages. - * @homa: Overall data about the Homa protocol implementation. The - * grantable_lock must be held by the caller. - * Return: The number of bytes of additional grants that were issued. 
- */ -int homa_grant_fifo(struct homa *homa) -{ - struct homa_rpc *candidate, *oldest; - __u64 oldest_birth; - struct homa_peer *peer; - struct grant_header grant; - int granted; - - oldest = NULL; - oldest_birth = ~0; - - /* Find the oldest message that doesn't currently have an - * outstanding "pity grant". - */ - list_for_each_entry(peer, &homa->grantable_peers, grantable_links) { - list_for_each_entry(candidate, &peer->grantable_rpcs, - grantable_links) { - int received, on_the_way; - - if (candidate->msgin.birth >= oldest_birth) - continue; - - received = (candidate->msgin.total_length - - candidate->msgin.bytes_remaining); - on_the_way = candidate->msgin.incoming - received; - if (on_the_way > homa->rtt_bytes) { - /* The last "pity" grant hasn't been used - * up yet. - */ - continue; - } - oldest = candidate; - oldest_birth = candidate->msgin.birth; } - } - if (oldest == NULL) - return 0; - INC_METRIC(fifo_grants, 1); - if ((oldest->msgin.total_length - oldest->msgin.bytes_remaining) - == oldest->msgin.incoming) - INC_METRIC(fifo_grants_no_incoming, 1); - - oldest->silent_ticks = 0; - granted = homa->fifo_grant_increment; - oldest->msgin.incoming += granted; - if (oldest->msgin.incoming >= oldest->msgin.total_length) { - granted -= oldest->msgin.incoming - oldest->msgin.total_length; - oldest->msgin.incoming = oldest->msgin.total_length; - homa_remove_grantable_locked(homa, oldest); - } - grant.offset = htonl(oldest->msgin.incoming); - grant.priority = homa->max_sched_prio; - tt_record3("sending fifo grant for id %llu, offset %d, priority %d", - oldest->id, oldest->msgin.incoming, - homa->max_sched_prio); - homa_xmit_control(GRANT, &grant, sizeof(grant), oldest); - return granted; -} -/** - * homa_remove_grantable_locked() - This method does all the real work of - * homa_remove_from_grantable, but it assumes that the caller holds the - * grantable lock, so it can be used by other functions that already - * hold the lock. - * @homa: Overall data about the Homa protocol implementation. - * @rpc: RPC that is no longer grantable. Must be locked, and must - * currently be linked into grantable lists. - */ -void homa_remove_grantable_locked(struct homa *homa, struct homa_rpc *rpc) -{ - struct homa_rpc *head; - struct homa_peer *peer = rpc->peer; - struct homa_rpc *candidate; - - head = list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links); - list_del_init(&rpc->grantable_links); - if (rpc != head) - return; - - /* The removed RPC was at the front of the peer's list. This means - * we may have to adjust the position of the peer in Homa's list, - * or perhaps remove it. - */ - if (list_empty(&peer->grantable_rpcs)) { - homa->num_grantable_peers--; - list_del_init(&peer->grantable_links); - return; - } - - /* The peer may have to move down in Homa's list (removal of - * an RPC can't cause the peer to move up). 
- */ - head = list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links); - while (peer != list_last_entry(&homa->grantable_peers, struct homa_peer, - grantable_links)) { - struct homa_peer *next_peer = list_next_entry( - peer, grantable_links); - candidate = list_first_entry(&next_peer->grantable_rpcs, - struct homa_rpc, grantable_links); - if (candidate->msgin.bytes_remaining - > head->msgin.bytes_remaining) + if (nonblocking) { + result = -EAGAIN; + IF_NO_STRIP(avail_immediately = 0); break; - __list_del_entry(&peer->grantable_links); - list_add(&peer->grantable_links, &next_peer->grantable_links); - } -} - -/** - * homa_remove_from_grantable() - This method ensures that an RPC - * is no longer linked into peer->grantable_rpcs (i.e. it won't be - * visible to homa_manage_grants). - * @homa: Overall data about the Homa protocol implementation. - * @rpc: RPC that is being destroyed. Must be locked. - */ -void homa_remove_from_grantable(struct homa *homa, struct homa_rpc *rpc) -{ - UNIT_LOG("; ", "homa_remove_from_grantable invoked"); - /* In order to determine for sure whether an RPC is in the - * grantable_rpcs we would need to acquire homa_grantable_lock, - * which is expensive because it's global. Howevever, we can - * check whether the RPC is queued without acquiring the lock, - * and if it's not, then we don't need to acquire the lock (the - * RPC can't get added to the queue without locking it, and we own - * the RPC's lock). If it is in the queue, then we have to require - * homa_grantable_lock and check again (it could have gotten - * removed in the meantime). - */ - if (list_empty(&rpc->grantable_links)) - return; - homa_grantable_lock(homa); - if (!list_empty(&rpc->grantable_links)) { - homa_remove_grantable_locked(homa, rpc); - homa_grantable_unlock(homa); - homa_send_grants(homa); - } else - homa_grantable_unlock(homa); -} - -/** - * homa_log_grantable_list() - Print information about the entries on the - * grantable list to the kernel log. This is intended for debugging use - * via the log_topic sysctl parameter. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_log_grantable_list(struct homa *homa) -{ - int bucket, count; - struct homa_peer *peer, *peer2; - struct homa_rpc *rpc; - - printk(KERN_NOTICE "Logging Homa grantable list\n"); - homa_grantable_lock(homa); - for (bucket = 0; bucket < HOMA_PEERTAB_BUCKETS; bucket++) { - hlist_for_each_entry_rcu(peer, &homa->peers.buckets[bucket], - peertab_links) { - printk(KERN_NOTICE "Checking peer %s\n", - homa_print_ipv6_addr(&peer->addr)); - if (list_empty(&peer->grantable_rpcs)) - continue; - count = 0; - list_for_each_entry(rpc, &peer->grantable_rpcs, - grantable_links) { - count++; - if (count > 10) - continue; - homa_rpc_log(rpc); - } - printk(KERN_NOTICE "Peer %s has %d grantable RPCs\n", - homa_print_ipv6_addr(&peer->addr), - count); - list_for_each_entry(peer2, &homa->grantable_peers, - grantable_links) { - if (peer2 == peer) - goto next_peer; - } - printk(KERN_NOTICE "Peer %s has grantable RPCs but " - "isn't on homa->grantable_peers\n", - homa_print_ipv6_addr(&peer->addr)); - next_peer: - continue; - } - } - homa_grantable_unlock(homa); - printk(KERN_NOTICE "Finished logging Homa grantable list\n"); -} - -/** - * homa_rpc_abort() - Terminate an RPC and arrange for an error to be returned - * to the application. - * @crpc: RPC to be terminated. Must be a client RPC. - * @error: A negative errno value indicating the error that caused the abort. 
- */ -void homa_rpc_abort(struct homa_rpc *crpc, int error) -{ - homa_remove_from_grantable(crpc->hsk->homa, crpc); - crpc->error = error; - homa_sock_lock(crpc->hsk, "homa_rpc_abort"); - if (!crpc->hsk->shutdown) - homa_rpc_handoff(crpc); - homa_sock_unlock(crpc->hsk); -} - -/** - * homa_abort_rpcs() - Abort all RPCs to/from a particular peer. - * @homa: Overall data about the Homa protocol implementation. - * @addr: Address (network order) of the destination whose RPCs are - * to be aborted. - * @port: If nonzero, then RPCs will only be aborted if they were - * targeted at this server port. - * @error: Negative errno value indicating the reason for the abort. - */ -void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, - int port, int error) -{ - struct homa_socktab_scan scan; - struct homa_sock *hsk; - struct homa_rpc *rpc, *tmp; - - rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { - /* Skip the (expensive) lock acquisition if there's no - * work to do. - */ - if (list_empty(&hsk->active_rpcs)) - continue; - if (!homa_protect_rpcs(hsk)) - continue; - list_for_each_entry_safe(rpc, tmp, &hsk->active_rpcs, - active_links) { - if (!ipv6_addr_equal(&rpc->peer->addr, addr)) - continue; - if ((port != 0) && (rpc->dport != port)) - continue; - homa_rpc_lock(rpc); - if (homa_is_client(rpc->id)) { - tt_record3("aborting client RPC: peer 0x%x, " - "id %u, error %d", - tt_addr(rpc->peer->addr), - rpc->id, error); - homa_rpc_abort(rpc, error); - } else { - INC_METRIC(server_rpc_discards, 1); - tt_record3("discarding server RPC: peer 0x%x, " - "id %d, error %d", - tt_addr(rpc->peer->addr), - rpc->id, error); - homa_rpc_free(rpc); - } - homa_rpc_unlock(rpc); - } - homa_unprotect_rpcs(hsk); - } - rcu_read_unlock(); -} - -/** - * homa_abort_rpcs() - Abort all outgoing (client-side) RPCs on a given socket. - * @hsk: Socket whose RPCs should be aborted. - * @error: Zero means that the aborted RPCs should be freed immediately. - * A nonzero value means that the RPCs should be marked - * complete, so that they can be returned to the application; - * this value (a negative errno) will be returned from - * recvmsg. - */ -void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) -{ - struct homa_rpc *rpc, *tmp; - - rcu_read_lock(); - if (list_empty(&hsk->active_rpcs)) - goto done; - if (!homa_protect_rpcs(hsk)) - goto done; - list_for_each_entry_safe(rpc, tmp, &hsk->active_rpcs, active_links) { - if (!homa_is_client(rpc->id)) - continue; - homa_rpc_lock(rpc); - if (rpc->state == RPC_DEAD) { - homa_rpc_unlock(rpc); - continue; } - tt_record4("homa_abort_sock_rpcs aborting id %u on port %d, " - "peer 0x%x, error %d", - rpc->id, hsk->port, - tt_addr(rpc->peer->addr), error); - if (error) { - homa_rpc_abort(rpc, error); - } else - homa_rpc_free(rpc); - homa_rpc_unlock(rpc); - } - homa_unprotect_rpcs(hsk); - done: - rcu_read_unlock(); -} -/** - * homa_register_interests() - Records information in various places so - * that a thread will be woken up if an RPC that it cares about becomes - * available. - * @interest: Used to record information about the messages this thread is - * waiting on. The initial contents of the structure are - * assumed to be undefined. - * @hsk: Socket on which relevant messages will arrive. Must not be - * locked. - * @flags: Flags field from homa_recvmsg_args; see manual entry for - * details. 
- * @id: If non-zero, then the caller is interested in receiving
- * the response for this RPC (@id must be a client request).
- * Return: Either zero or a negative errno value. If a matching RPC
- * is already available, information about it will be stored in
- * interest.
- */
-int homa_register_interests(struct homa_interest *interest,
- struct homa_sock *hsk, int flags, __u64 id)
-{
- struct homa_rpc *rpc = NULL;
-
- homa_interest_init(interest);
- interest->locked = 1;
- if (id != 0) {
- if (!homa_is_client(id))
- return -EINVAL;
- rpc = homa_find_client_rpc(hsk, id);
- if (rpc == NULL)
- return -EINVAL;
- if ((rpc->interest != NULL) && (rpc->interest != interest)) {
- homa_rpc_unlock(rpc);
- return -EINVAL;
- }
- }
-
- /* Need both the RPC lock (acquired above) and the socket lock to
- * avoid races.
- */
- homa_sock_lock(hsk, "homa_register_interests");
- if (hsk->shutdown) {
- homa_sock_unlock(hsk);
- if (rpc)
- homa_rpc_unlock(rpc);
- return -ESHUTDOWN;
- }
+ result = homa_interest_init_private(&interest, rpc);
+ if (result != 0)
+ break;
- if (id != 0) {
- if ((atomic_read(&rpc->flags) & RPC_PKTS_READY) || rpc->error)
- goto claim_rpc;
- rpc->interest = interest;
- interest->reg_rpc = rpc;
 homa_rpc_unlock(rpc);
- }
-
- interest->locked = 0;
- if (flags & HOMA_RECVMSG_RESPONSE) {
- if (!list_empty(&hsk->ready_responses)) {
- rpc = list_first_entry(
- &hsk->ready_responses,
- struct homa_rpc,
- ready_links);
- goto claim_rpc;
- }
- /* Insert this thread at the *front* of the list;
- * we'll get better cache locality if we reuse
- * the same thread over and over, rather than
- * round-robining between threads. Same below.
+ result = homa_interest_wait(&interest);
+#ifndef __STRIP__ /* See strip.py */
+ avail_immediately = 0;
+ blocked |= interest.blocked;
+#endif /* See strip.py */
+
+ homa_rpc_lock_preempt(rpc);
+ homa_interest_unlink_private(&interest);
+ tt_record3("homa_wait_private found rpc id %d, pid %d via handoff, blocked %d",
+ rpc->id, current->pid, interest.blocked);
+
+ /* Abort on error, but if the interest actually became ready
+ * in the meantime then ignore the error (loop back around
+ * to process the RPC).
 */
- list_add(&interest->response_links,
- &hsk->response_interests);
- }
- if (flags & HOMA_RECVMSG_REQUEST) {
- if (!list_empty(&hsk->ready_requests)) {
- rpc = list_first_entry(&hsk->ready_requests,
- struct homa_rpc, ready_links);
- /* Make sure the interest isn't on the response list;
- * otherwise it might receive a second RPC.
- */
- if (interest->response_links.next != LIST_POISON1)
- list_del(&interest->response_links);
- goto claim_rpc;
- }
- list_add(&interest->request_links, &hsk->request_interests);
- }
- homa_sock_unlock(hsk);
- return 0;
-
- claim_rpc:
- list_del_init(&rpc->ready_links);
- if (!list_empty(&hsk->ready_requests) ||
- !list_empty(&hsk->ready_responses)) {
- // There are still more RPCs available, so let Linux know.
- hsk->sock.sk_data_ready(&hsk->sock); + if (result != 0 && atomic_read(&interest.ready) == 0) + break; } - /* This flag is needed to keep the RPC from being reaped during the - * gap between when we release the socket lock and we acquire the - * RPC lock.*/ - atomic_or(RPC_HANDING_OFF, &rpc->flags); - homa_sock_unlock(hsk); - if (!interest->locked) { - homa_rpc_lock(rpc); - interest->locked = 1; +#ifndef __STRIP__ /* See strip.py */ + if (avail_immediately) { + INC_METRIC(wait_none, 1); + } else if (result == 0) { + if (blocked) + INC_METRIC(wait_block, 1); + else + INC_METRIC(wait_fast, 1); } - atomic_andnot(RPC_HANDING_OFF, &rpc->flags); - atomic_long_set_release(&interest->ready_rpc, (long) rpc); - return 0; +#endif /* See strip.py */ + return result; } /** - * @homa_wait_for_message() - Wait for receipt of an incoming message - * that matches the parameters. Various other activities can occur while - * waiting, such as reaping dead RPCs and copying data to user space. - * @hsk: Socket where messages will arrive. - * @flags: Flags field from homa_recvmsg_args; see manual entry for - * details. - * @id: If non-zero, then a response message matching this id may - * be returned (@id must refer to a client request). + * homa_wait_shared() - Wait for the completion of any non-private + * incoming message on a socket. + * @hsk: Socket on which to wait. Must not be locked. + * @nonblocking: Nonzero means return immediately if no RPC is ready. * - * Return: Pointer to an RPC that matches @flags and @id, or a negative - * errno value. The RPC will be locked; the caller must unlock. + * Return: Pointer to an RPC with a complete incoming message or nonzero + * error field, or a negative errno (usually -EINTR). If an RPC + * is returned it will be locked and referenced; the caller + * must release the lock and the reference. */ -struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, - __u64 id) +struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) + __cond_acquires(rpc->bucket->lock) { - struct homa_rpc *result = NULL; struct homa_interest interest; - struct homa_rpc *rpc = NULL; - uint64_t poll_start, now; - int error, blocked = 0, polled = 0; - - /* Each iteration of this loop finds an RPC, but it might not be - * in a state where we can return it (e.g., there might be packets - * ready to transfer to user space, but the incoming message isn't yet - * complete). Thus it could take many iterations of this loop - * before we have an RPC with a complete message. - */ - while (1) { - error = homa_register_interests(&interest, hsk, flags, id); - rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc); - if (rpc) { - goto found_rpc; - } - if (error < 0) { - result = ERR_PTR(error); - goto found_rpc; - } + struct homa_rpc *rpc; + int result; -// tt_record3("Preparing to poll, socket %d, flags 0x%x, pid %d", -// hsk->client_port, flags, current->pid); + IF_NO_STRIP(int avail_immediately = 1); + IF_NO_STRIP(int blocked = 0); - /* There is no ready RPC so far. Clean up dead RPCs before - * going to sleep (or returning, if in nonblocking mode). - */ - while (1) { - int reaper_result; - rpc = (struct homa_rpc *) atomic_long_read( - &interest.ready_rpc); - if (rpc) { - tt_record1("received RPC handoff while reaping, id %d", - rpc->id); - goto found_rpc; - } - reaper_result = homa_rpc_reap(hsk, - hsk->homa->reap_limit); - if (reaper_result == 0) - break; - - /* Give NAPI and SoftIRQ tasks a chance to run. 
*/ - schedule(); - } - tt_record1("Checking nonblocking, flags %d", flags); - if (flags & HOMA_RECVMSG_NONBLOCKING) { - result = ERR_PTR(-EAGAIN); - goto found_rpc; + INIT_LIST_HEAD(&interest.links); + init_waitqueue_head(&interest.wait_queue); + /* Each iteration through this loop waits until an RPC needs attention + * in some way (e.g. packets have arrived), then deals with that need + * (e.g. copy to user space). It may take many iterations until an + * RPC is ready for the application. + */ + while (1) { + homa_sock_lock(hsk); + if (hsk->shutdown) { + rpc = ERR_PTR(-ESHUTDOWN); + homa_sock_unlock(hsk); + goto done; } - - /* Busy-wait for a while before going to sleep; this avoids - * context-switching overhead to wake up. - */ - poll_start = now = get_cycles(); - while (1) { - __u64 blocked; - rpc = (struct homa_rpc *) atomic_long_read( - &interest.ready_rpc); - if (rpc) { - tt_record3("received RPC handoff while polling, id %d, socket %d, pid %d", - rpc->id, hsk->port, - current->pid); - polled = 1; - INC_METRIC(poll_cycles, now - poll_start); - goto found_rpc; - } - if (now >= (poll_start + hsk->homa->poll_cycles)) - break; - blocked = get_cycles(); - schedule(); - now = get_cycles(); - blocked = now - blocked; - if (blocked > 5000) { - /* Looks like another thread ran (or perhaps - * SoftIRQ). Count this time as blocked. + if (!list_empty(&hsk->ready_rpcs)) { + rpc = list_first_entry(&hsk->ready_rpcs, + struct homa_rpc, + ready_links); + tt_record2("homa_wait_shared found rpc id %d, pid %d via ready_rpcs, blocked 0", + rpc->id, current->pid); + homa_rpc_hold(rpc); + list_del_init(&rpc->ready_links); + if (!list_empty(&hsk->ready_rpcs)) { + /* There are still more RPCs available, so + * let Linux know. */ - INC_METRIC(blocked_cycles, blocked); - poll_start += blocked; + hsk->sock.sk_data_ready(&hsk->sock); } - } - tt_record2("Poll ended unsuccessfully on socket %d, pid %d", - hsk->port, current->pid); - INC_METRIC(poll_cycles, now - poll_start); - - /* Now it's time to sleep. */ - set_current_state(TASK_INTERRUPTIBLE); - rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc); - if (!rpc && !hsk->shutdown) { - __u64 end; - __u64 start = get_cycles(); - tt_record1("homa_wait_for_message sleeping, pid %d", - current->pid); - schedule(); - end = get_cycles(); - blocked = 1; - INC_METRIC(blocked_cycles, end - start); - } - __set_current_state(TASK_RUNNING); - -found_rpc: - /* If we get here, it means either an RPC is ready for our - * attention or an error occurred. - * - * First, clean up all of the interests. Must do this before - * making any other decisions, because until we do, an incoming - * message could still be passed to us. Note: if we went to - * sleep, then this info was already cleaned up by whoever - * woke us up. Also, values in the interest may change between - * when we test them below and when we acquire the socket lock, - * so they have to be checked again after locking the socket. 
- */
- UNIT_HOOK("found_rpc");
- if ((interest.reg_rpc)
- || (interest.request_links.next != LIST_POISON1)
- || (interest.response_links.next
- != LIST_POISON1)) {
- homa_sock_lock(hsk, "homa_wait_for_message");
- if (interest.reg_rpc)
- interest.reg_rpc->interest = NULL;
- if (interest.request_links.next != LIST_POISON1)
- list_del(&interest.request_links);
- if (interest.response_links.next != LIST_POISON1)
- list_del(&interest.response_links);
 homa_sock_unlock(hsk);
- }
+ } else if (nonblocking) {
+ rpc = ERR_PTR(-EAGAIN);
+ homa_sock_unlock(hsk);
+ IF_NO_STRIP(avail_immediately = 0);

- /* Now check to see if we received an RPC handoff (note that
- * this could have happened anytime up until we reset the
- * interests above).
- */
- rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc);
- if (rpc) {
- tt_record2("homa_wait_for_message found rpc id %d, pid %d",
- rpc->id, current->pid);
- if (!interest.locked)
- homa_rpc_lock(rpc);
- atomic_andnot(RPC_HANDING_OFF, &rpc->flags);
- if (rpc->state == RPC_DEAD) {
- homa_rpc_unlock(rpc);
- continue;
+ /* This is a good time to clean up dead RPCs. */
+ homa_rpc_reap(hsk, false);
+ goto done;
+ } else {
+ homa_interest_init_shared(&interest, hsk);
+ homa_sock_unlock(hsk);
+ result = homa_interest_wait(&interest);
+#ifndef __STRIP__ /* See strip.py */
+ avail_immediately = 0;
+ blocked |= interest.blocked;
+#endif /* See strip.py */
+
+ if (result != 0) {
+ int ready;
+
+ /* homa_interest_wait returned an error, so we
+ * have to do two things. First, unlink the
+ * interest from the socket. Second, check to
+ * see if in the meantime the interest received
+ * a handoff. If so, ignore the error. It is very
+ * important to hold the socket lock while
+ * checking, in order to eliminate races with
+ * homa_rpc_handoff.
+ */
+ homa_sock_lock(hsk);
+ homa_interest_unlink_shared(&interest);
+ ready = atomic_read(&interest.ready);
+ homa_sock_unlock(hsk);
+ if (ready == 0) {
+ rpc = ERR_PTR(result);
+ goto done;
+ }
 }
- if (!rpc->error)
- rpc->error = homa_copy_to_user(rpc);
- if (rpc->error)
- goto done;
- atomic_andnot(RPC_PKTS_READY, &rpc->flags);
- if (rpc->msgin.copied_out == rpc->msgin.total_length)
+
+ rpc = interest.rpc;
+ if (!rpc) {
+ rpc = ERR_PTR(-ESHUTDOWN);
 goto done;
- homa_rpc_unlock(rpc);
+ }
+ tt_record3("homa_wait_shared found rpc id %d, pid %d via handoff, blocked %d",
+ rpc->id, current->pid, interest.blocked);
 }

- /* A complete message isn't available: check for errors. */
- if (IS_ERR(result))
- return result;
- if (signal_pending(current))
- return ERR_PTR(-EINTR);
-
- /* No message and no error; try again. */
+ homa_rpc_lock_preempt(rpc);
+ if (!rpc->error)
+ rpc->error = homa_copy_to_user(rpc);
+ if (rpc->error) {
+ if (rpc->state != RPC_DEAD)
+ break;
+ } else if (rpc->msgin.bytes_remaining == 0 &&
+ skb_queue_len(&rpc->msgin.packets) == 0)
+ break;
+ homa_rpc_put(rpc);
+ homa_rpc_unlock(rpc);
 }

 done:
- if (blocked)
- INC_METRIC(slow_wakeups, 1);
- else if (polled)
- INC_METRIC(fast_wakeups, 1);
+#ifndef __STRIP__ /* See strip.py */
+ if (avail_immediately) {
+ INC_METRIC(wait_none, 1);
+ } else if (!IS_ERR(rpc)) {
+ if (blocked)
+ INC_METRIC(wait_block, 1);
+ else
+ INC_METRIC(wait_fast, 1);
+ }
+#endif /* See strip.py */
 return rpc;
-}

/**
- * @homa_rpc_handoff: This function is called when the input message for
- * an RPC is ready for attention from a user thread. It either notifies
- * a waiting reader or queues the RPC.
- * @rpc: RPC to handoff; must be locked.
The caller must - * also have locked the socket for this RPC. + * homa_rpc_handoff() - This function is called when the input message for + * an RPC is ready for attention from a user thread. It notifies a waiting + * reader and/or queues the RPC, as appropriate. + * @rpc: RPC to handoff; must be locked. */ void homa_rpc_handoff(struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { - struct homa_interest *interest; struct homa_sock *hsk = rpc->hsk; + struct homa_interest *interest; - if ((atomic_read(&rpc->flags) & RPC_HANDING_OFF) - || !list_empty(&rpc->ready_links)) + if (test_bit(RPC_PRIVATE, &rpc->flags)) { + homa_interest_notify_private(rpc); return; - - /* First, see if someone is interested in this RPC specifically. - */ - if (rpc->interest) { - interest = rpc->interest; - goto thread_waiting; } - /* Second, check the interest list for this type of RPC. */ - if (homa_is_client(rpc->id)) { - interest = list_first_entry_or_null( - &hsk->response_interests, - struct homa_interest, response_links); - if (interest) - goto thread_waiting; - list_add_tail(&rpc->ready_links, &hsk->ready_responses); - INC_METRIC(responses_queued, 1); - } else { - interest = list_first_entry_or_null( - &hsk->request_interests, - struct homa_interest, request_links); - if (interest) - goto thread_waiting; - list_add_tail(&rpc->ready_links, &hsk->ready_requests); - INC_METRIC(requests_queued, 1); - } - - /* If we get here, no-one is waiting for the RPC, so it has been - * queued. - */ - - /* Notify the poll mechanism. */ - hsk->sock.sk_data_ready(&hsk->sock); - tt_record2("homa_rpc_handoff finished queuing id %d for port %d", - rpc->id, hsk->port); - return; - -thread_waiting: - /* We found a waiting thread. The following 3 lines must be here, - * before clearing the interest, in order to avoid a race with - * homa_wait_for_message (which won't acquire the socket lock if - * the interest is clear). + /* Shared RPC; if there is a waiting thread, hand off the RPC; + * otherwise enqueue it. */ - atomic_or(RPC_HANDING_OFF, &rpc->flags); - interest->locked = 0; - atomic_long_set_release(&interest->ready_rpc, (long) rpc); - - /* Clear the interest. This serves two purposes. First, it saves - * the waking thread from acquiring the socket lock again, which - * reduces contention on that lock). Second, it ensures that - * no-one else attempts to give this interest a different RPC. - */ - if (interest->reg_rpc) { - interest->reg_rpc->interest = NULL; - interest->reg_rpc = NULL; + homa_sock_lock(hsk); + if (hsk->shutdown) { + homa_sock_unlock(hsk); + return; + } + if (!list_empty(&hsk->interests)) { +#ifndef __STRIP__ /* See strip.py */ + interest = homa_choose_interest(hsk); +#else /* See strip.py */ + interest = list_first_entry(&hsk->interests, + struct homa_interest, links); +#endif /* See strip.py */ + list_del_init(&interest->links); + interest->rpc = rpc; + homa_rpc_hold(rpc); + tt_record1("homa_rpc_handoff handing off id %d", rpc->id); + atomic_set_release(&interest->ready, 1); + wake_up(&interest->wait_queue); + INC_METRIC(handoffs_thread_waiting, 1); + +#ifndef __STRIP__ /* See strip.py */ + /* Update the last_app_active time for the thread's core, so + * Homa will try to avoid assigning any work there. 
+ */ + per_cpu(homa_offload_core, interest->core).last_app_active = + homa_clock(); +#endif /* See strip.py */ + } else if (list_empty(&rpc->ready_links)) { + list_add_tail(&rpc->ready_links, &hsk->ready_rpcs); + hsk->sock.sk_data_ready(&hsk->sock); + tt_record2("homa_rpc_handoff queued id %d for port %d", + rpc->id, hsk->port); } - if (interest->request_links.next != LIST_POISON1) - list_del(&interest->request_links); - if (interest->response_links.next != LIST_POISON1) - list_del(&interest->response_links); - wake_up_process(interest->thread); - tt_record3("homa_rpc_handoff handed off id %d to pid %d on core %d", - rpc->id, interest->thread->pid, - task_cpu(interest->thread)); + homa_sock_unlock(hsk); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_incoming_sysctl_changed() - Invoked whenever a sysctl value is changed; * any input-related parameters that depend on sysctl-settable values. @@ -1811,35 +1328,10 @@ void homa_rpc_handoff(struct homa_rpc *rpc) */ void homa_incoming_sysctl_changed(struct homa *homa) { - __u64 tmp; - - homa->max_incoming = homa->max_overcommit * homa->rtt_bytes; - - if (homa->grant_fifo_fraction > 500) - homa->grant_fifo_fraction = 500; - tmp = homa->grant_fifo_fraction; - if (tmp != 0) - tmp = (1000*homa->fifo_grant_increment)/tmp - - homa->fifo_grant_increment; - homa->grant_nonfifo = tmp; - - /* Code below is written carefully to avoid integer underflow or - * overflow under expected usage patterns. Be careful when changing! - */ - tmp = homa->poll_usecs; - tmp = (tmp*cpu_khz)/1000; - homa->poll_cycles = tmp; - - tmp = homa->gro_busy_usecs; - tmp = (tmp*cpu_khz)/1000; - homa->gro_busy_cycles = tmp; - - tmp = homa->rtt_bytes * homa->duty_cycle; - homa->grant_threshold = tmp/1000; - if (homa->grant_threshold > homa->rtt_bytes) - homa->grant_threshold = homa->rtt_bytes; - - tmp = homa->bpage_lease_usecs; - tmp = (tmp*cpu_khz)/1000; - homa->bpage_lease_cycles = tmp; + homa->poll_cycles = homa_usecs_to_cycles(homa->poll_usecs); + homa->busy_cycles = homa_usecs_to_cycles(homa->busy_usecs); + homa->gro_busy_cycles = homa_usecs_to_cycles(homa->gro_busy_usecs); + homa->bpage_lease_cycles = + homa_usecs_to_cycles(homa->bpage_lease_usecs); } +#endif /* See strip.py */ diff --git a/homa_interest.c b/homa_interest.c new file mode 100644 index 00000000..40f96c0f --- /dev/null +++ b/homa_interest.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +/* This file contains functions for managing homa_interest structs. */ + +#include "homa_impl.h" +#include "homa_interest.h" +#include "homa_rpc.h" +#include "homa_sock.h" + +#ifndef __STRIP__ /* See strip.py */ +#include "homa_offload.h" +#endif /* See strip.py */ + +/** + * homa_interest_init_shared() - Initialize an interest and queue it up on + * a socket. + * @interest: Interest to initialize + * @hsk: Socket on which the interests should be queued. Must be locked + * by caller. + */ +void homa_interest_init_shared(struct homa_interest *interest, + struct homa_sock *hsk) + __must_hold(hsk->lock) +{ + interest->rpc = NULL; + atomic_set(&interest->ready, 0); + IF_NO_STRIP(interest->core = raw_smp_processor_id()); + interest->blocked = 0; + init_waitqueue_head(&interest->wait_queue); + interest->hsk = hsk; + list_add(&interest->links, &hsk->interests); +} + +/** + * homa_interest_init_private() - Initialize an interest that will wait + * on a particular (private) RPC, and link it to that RPC. + * @interest: Interest to initialize. + * @rpc: RPC to associate with the interest. 
Must be private, and
+ * caller must have locked it.
+ *
+ * Return: 0 for success, otherwise a negative errno.
+ */
+int homa_interest_init_private(struct homa_interest *interest,
+ struct homa_rpc *rpc)
+ __must_hold(rpc->bucket->lock)
+{
+ if (rpc->private_interest)
+ return -EINVAL;
+
+ interest->rpc = rpc;
+ atomic_set(&interest->ready, 0);
+ IF_NO_STRIP(interest->core = raw_smp_processor_id());
+ interest->blocked = 0;
+ init_waitqueue_head(&interest->wait_queue);
+ interest->hsk = rpc->hsk;
+ rpc->private_interest = interest;
+ return 0;
+}
+
+/**
+ * homa_interest_wait() - Wait for an interest to have an actionable RPC,
+ * or for an error to occur.
+ * @interest: Interest to wait for; must previously have been initialized
+ * and linked to a socket or RPC. On return, the interest
+ * will have been unlinked if its ready flag is set; otherwise
+ * it may still be linked.
+ *
+ * Return: 0 for success (the ready flag is set in the interest), or -EINTR
+ * if the thread received an interrupt.
+ */
+int homa_interest_wait(struct homa_interest *interest)
+{
+ struct homa_sock *hsk = interest->hsk;
+ int result = 0;
+ int iteration;
+ int wait_err;
+
+#ifndef __STRIP__ /* See strip.py */
+ u64 start, block_start, blocked_time, now;
+
+ start = homa_clock();
+ blocked_time = 0;
+#endif /* See strip.py */
+ interest->blocked = 0;
+
+ /* This loop iterates in order to poll and/or reap dead RPCs. */
+ for (iteration = 0; ; iteration++) {
+ if (iteration != 0)
+ /* Give NAPI/SoftIRQ tasks a chance to run. */
+ schedule();
+
+ if (atomic_read_acquire(&interest->ready) != 0)
+ goto done;
+
+ /* See if we can clean up dead RPCs while waiting. */
+ if (homa_rpc_reap(hsk, false) != 0)
+ continue;
+
+#ifndef __STRIP__ /* See strip.py */
+ now = homa_clock();
+ per_cpu(homa_offload_core,
+ raw_smp_processor_id()).last_app_active = now;
+ if (now - start >= hsk->homa->poll_cycles)
+ break;
+#else /* See strip.py */
+ break;
+#endif /* See strip.py */
+ }
+
+ interest->blocked = 1;
+ IF_NO_STRIP(block_start = now);
+ wait_err = wait_event_interruptible_exclusive(interest->wait_queue,
+ atomic_read_acquire(&interest->ready) != 0);
+ IF_NO_STRIP(blocked_time = homa_clock() - block_start);
+ if (wait_err == -ERESTARTSYS)
+ result = -EINTR;
+
+done:
+#ifndef __STRIP__ /* See strip.py */
+ if (interest->blocked)
+ INC_METRIC(blocked_cycles, blocked_time);
+ INC_METRIC(poll_cycles, homa_clock() - start - blocked_time);
+#endif /* See strip.py */
+ return result;
+}
+
+/**
+ * homa_interest_notify_private() - If a thread is waiting on the private
+ * interest for an RPC, wake it up.
+ * @rpc: RPC that may (potentially) have a private interest. Must be
+ * locked by the caller.
+ */
+void homa_interest_notify_private(struct homa_rpc *rpc)
+ __must_hold(rpc->bucket->lock)
+{
+ if (rpc->private_interest) {
+ atomic_set_release(&rpc->private_interest->ready, 1);
+ wake_up(&rpc->private_interest->wait_queue);
+ }
+}
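The notify/wait pair above depends on a release/acquire handoff: the notifier fills in the interest's RPC before the release store to ready, and the waiter re-reads ready with acquire semantics both while polling and in the wait-queue condition, so once it observes ready != 0 it is also guaranteed to see the RPC pointer. The following minimal sketch (not part of the patch; all names are illustrative) isolates that protocol:

#include <linux/atomic.h>
#include <linux/wait.h>

struct demo_interest {
	void *payload;               /* Filled in before ready is set. */
	atomic_t ready;              /* 0 = not ready, 1 = payload valid. */
	struct wait_queue_head wq;
};

static void demo_init(struct demo_interest *in)
{
	in->payload = NULL;
	atomic_set(&in->ready, 0);
	init_waitqueue_head(&in->wq);
}

/* Producer side (cf. homa_interest_notify_private above). */
static void demo_notify(struct demo_interest *in, void *payload)
{
	in->payload = payload;               /* 1. Publish the payload. */
	atomic_set_release(&in->ready, 1);   /* 2. Release store; pairs with
					      *    the waiter's acquire load.
					      */
	wake_up(&in->wq);                    /* 3. Wake the (single) waiter. */
}

/* Consumer side (cf. homa_interest_wait above). Returns 0 on success,
 * -ERESTARTSYS if interrupted by a signal.
 */
static int demo_wait(struct demo_interest *in)
{
	return wait_event_interruptible_exclusive(in->wq,
			atomic_read_acquire(&in->ready) != 0);
}

Note that wait_event_interruptible_exclusive() re-evaluates the condition after queuing the caller on the wait queue, which is what makes it safe for demo_notify() to run concurrently with a waiter that has not yet gone to sleep.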
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_choose_interest() - Given all the interests registered for a socket,
+ * choose the best one to handle an incoming message.
+ * @hsk: Socket for which message is intended. Must be locked by caller,
+ * and must have at least one queued interest.
+ * Return: The interest to use. This function tries to pick an
+ * interest whose thread is running on a core that isn't
+ * currently busy doing Homa transport work.
+ */
+struct homa_interest *homa_choose_interest(struct homa_sock *hsk)
+ __must_hold(hsk->lock)
+{
+ u64 busy_time = homa_clock() - hsk->homa->busy_cycles;
+ struct homa_interest *interest, *first;
+
+ first = list_first_entry(&hsk->interests, struct homa_interest,
+ links);
+ list_for_each_entry(interest, &hsk->interests, links) {
+ if (per_cpu(homa_offload_core, interest->core).last_active <
+ busy_time) {
+ if (interest != first)
+ INC_METRIC(handoffs_alt_thread, 1);
+ return interest;
+ }
+ }
+
+ /* All interested threads are on busy cores; return the first,
+ * which is also the most recent one to be registered, hence
+ * most likely to have warm cache state.
+ */
+ return first;
+}
+#endif /* See strip.py */
diff --git a/homa_interest.h b/homa_interest.h
new file mode 100644
index 00000000..6a1e3c27
--- /dev/null
+++ b/homa_interest.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file defines struct homa_interest and related functions. */
+
+#ifndef _HOMA_INTEREST_H
+#define _HOMA_INTEREST_H
+
+#include "homa_rpc.h"
+#include "homa_sock.h"
+
+/**
+ * struct homa_interest - Holds info that allows applications to wait for
+ * incoming RPC messages. An interest can be either private, in which case
+ * the application is waiting for a single specific RPC response and the
+ * interest is referenced by rpc->private_interest, or shared, in which
+ * case the application is waiting for any incoming message that isn't
+ * private and the interest is present on hsk->interests.
+ */
+struct homa_interest {
+ /**
+ * @rpc: If ready is set, then this holds an RPC that needs
+ * attention, or NULL if this is a shared interest and hsk has
+ * been shut down. If ready is not set, this will be NULL if the
+ * interest is shared; if it's private, it holds the RPC the
+ * interest is associated with. If non-NULL, a reference has been
+ * taken on the RPC.
+ */
+ struct homa_rpc *rpc;
+
+ /**
+ * @ready: Nonzero means the interest is ready for attention: either
+ * there is an RPC that needs attention or @hsk has been shut down.
+ */
+ atomic_t ready;
+
+#ifndef __STRIP__ /* See strip.py */
+ /**
+ * @core: Core on which homa_wait_* was invoked. This is a hint
+ * used for load balancing (see balance.txt).
+ */
+ int core;
+#endif /* See strip.py */
+
+ /**
+ * @blocked: Zero means a handoff was received without the thread
+ * needing to block; nonzero means the thread blocked.
+ */
+ int blocked;
+
+ /**
+ * @wait_queue: Used to block the thread while waiting (will never
+ * have more than one queued thread).
+ */
+ struct wait_queue_head wait_queue;
+
+ /** @hsk: Socket that the interest is associated with. */
+ struct homa_sock *hsk;
+
+ /**
+ * @links: If the interest is shared, used to link this object into
+ * @hsk->interests.
+ */
+ struct list_head links;
+};
+
+/**
+ * homa_interest_unlink_shared() - Remove an interest from the list for a
+ * socket. Note: this can race with homa_rpc_handoff, so on return it's
+ * possible that the interest is ready.
+ * @interest: Interest to remove. Must have been initialized with
+ * homa_interest_init_shared.
+ */
+static inline void homa_interest_unlink_shared(struct homa_interest *interest)
+ __must_hold(interest->hsk->lock)
+{
+ list_del_init(&interest->links);
+}
+
+/**
+ * homa_interest_unlink_private() - Detach a private interest from its
+ * RPC. Note: this can race with homa_rpc_handoff, so on return it's
+ * possible that the interest is ready.
+ * @interest: Interest to remove.
Must have been initialized with + * homa_interest_init_private. Its RPC must be locked by + * the caller. + */ +static inline void homa_interest_unlink_private(struct homa_interest *interest) + __must_hold(interest->rpc->bucket->lock) +{ + if (interest == interest->rpc->private_interest) + interest->rpc->private_interest = NULL; +} + +void homa_interest_init_shared(struct homa_interest *interest, + struct homa_sock *hsk); +int homa_interest_init_private(struct homa_interest *interest, + struct homa_rpc *rpc); +void homa_interest_notify_private(struct homa_rpc *rpc); +int homa_interest_wait(struct homa_interest *interest); + +#ifndef __STRIP__ /* See strip.py */ +struct homa_interest + *homa_choose_interest(struct homa_sock *hsk); +#endif /* See strip.py */ + +#endif /* _HOMA_INTEREST_H */ diff --git a/homa_lcache.h b/homa_lcache.h deleted file mode 100755 index dfca0c52..00000000 --- a/homa_lcache.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2022, Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* This file implements homa_lcache objects */ - -#include "homa_impl.h" - -/** - * struct homa_lcache - Used to retain the lock for an RPC so that it can - * be reused efficiently (in particular, when processing a batch of packets, - * we want to keep the lock for the entire batch). - */ -struct homa_lcache { - /** @rpc: if non-NULL, this RPC is currently locked. */ - struct homa_rpc *rpc; -}; - -/** - * homa_lcache_init() - Constructor for homa_lcaches. - * @lc: The object to initialize; previous contents are discarded. - */ -static inline void homa_lcache_init(struct homa_lcache *lc) -{ - lc->rpc = NULL; -} - -/** - * homa_lcache_save() - Store info about a locked RPC. - * @lc: Lock cache in which to store info. Must be properly initialized; - * if it currently caches a lock, that lock is released. - * @rpc: RPC to cache: must be locked by caller. - */ -static inline void homa_lcache_save(struct homa_lcache *lc, - struct homa_rpc *rpc) -{ - if (lc->rpc) { - homa_rpc_unlock(lc->rpc); - } - lc->rpc = rpc; -} - -/** - * homa_lcache_release() - Unlock the cached RPC, if there is one. This must - * be invoked before abandoning the object. - * @lc: Lock cache. - */ -static inline void homa_lcache_release(struct homa_lcache *lc) -{ - if (lc->rpc) { - homa_rpc_unlock(lc->rpc); - } - lc->rpc = NULL; -} - -/** - * homa_lcache_get_server() - Check to see if a particular server RPC is - * locked. - * @lc: RPC lock cache to check - * @id: Id of the desired RPC - * @addr: Address of the peer machine for this RPC. - * @port: Peer's port for the RPC - * - * Return: if @lc has a cached lock for @id, return the corresponding - * RPC, otherwise return NULL. 
- */ -static inline struct homa_rpc *homa_lcache_get(struct homa_lcache *lc, - __u64 id, const struct in6_addr *addr, __u16 port) -{ - if ((lc->rpc != NULL) && (lc->rpc->id == id) - && ipv6_addr_equal(&lc->rpc->peer->addr, addr) - && (lc->rpc->dport == port)) - return lc->rpc; - return NULL; -} \ No newline at end of file diff --git a/homa_metrics.c b/homa_metrics.c new file mode 100644 index 00000000..dee3b123 --- /dev/null +++ b/homa_metrics.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +/* This file contains various functions for managing Homa's performance + * counters. + */ + +#include "homa_impl.h" + +DEFINE_PER_CPU(struct homa_metrics, homa_metrics); + +/* Describes file operations implemented for /proc/net/homa_metrics. */ +static const struct proc_ops homa_metrics_ops = { + .proc_open = homa_metrics_open, + .proc_read = homa_metrics_read, + .proc_lseek = homa_metrics_lseek, + .proc_release = homa_metrics_release, +}; + +/* Global information used to export metrics information through a file in + * /proc. + */ +struct homa_metrics_output homa_mout; + +/** + * homa_metrics_init() - Initialize global information related to metrics. + * Return: 0 for success, otherwise a negative errno. + */ +int homa_metrics_init(void) +{ + mutex_init(&homa_mout.mutex); + homa_mout.output = NULL; + homa_mout.dir_entry = proc_create("homa_metrics", 0444, + init_net.proc_net, + &homa_metrics_ops); + if (!homa_mout.dir_entry) { + pr_err("couldn't create /proc/net/homa_metrics\n"); + return -ENOMEM; + } + return 0; +} + +/** + * homa_metrics_end() - Called to clean up metrics information when the + * Homa module unloads. + */ +void homa_metrics_end(void) +{ + if (homa_mout.dir_entry) + proc_remove(homa_mout.dir_entry); + homa_mout.dir_entry = NULL; + kfree(homa_mout.output); + homa_mout.output = NULL; +} + +/** + * homa_metric_append() - Format a metric and append it to homa_mout.output. + * @name: Name of the metric + * @value: Value of the metric + * @format: Standard printf-style format string providing a human- + * readable description of the metric. Arguments after this + * provide the usual values expected for printf-like functions, + * if needed. + */ +void homa_metric_append(const char *name, u64 value, const char *format, ...) +{ + char *new_buffer; + size_t new_chars; + va_list ap; + + if (!homa_mout.output) { +#ifdef __UNIT_TEST__ + homa_mout.capacity = 200; +#else + homa_mout.capacity = 4096; +#endif + homa_mout.output = kmalloc(homa_mout.capacity, GFP_KERNEL); + if (!homa_mout.output) + return; + homa_mout.length = 0; + } + + while (homa_mout.capacity < homa_mout.length + 200) { + /* Not enough room; expand buffer capacity. */ + homa_mout.capacity *= 2; + new_buffer = kmalloc(homa_mout.capacity, GFP_KERNEL); + if (!new_buffer) + return; + memcpy(new_buffer, homa_mout.output, homa_mout.length); + kfree(homa_mout.output); + homa_mout.output = new_buffer; + } + + new_chars = snprintf(homa_mout.output + homa_mout.length, 60, + "%-30s %20llu ", name, value); + homa_mout.length += (new_chars > 60) ? 60 : new_chars; + va_start(ap, format); + new_chars = vsnprintf(homa_mout.output + homa_mout.length, 120, + format, ap); + va_end(ap); + homa_mout.length += (new_chars > 120) ? 120 : new_chars; +} + +/** + * homa_metrics_print() - Sample all of the Homa performance metrics and + * generate a human-readable string describing all of them. + * + * Return: The formatted string. 
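+ *
+ * Each line of the result is produced by homa_metric_append and has the
+ * form "name value description", e.g. (the value here is illustrative
+ * only):
+ *
+ * reaper_calls 1042 Reaper invocations that were not disabled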
+ */ +char *homa_metrics_print(void) +{ + int core, i, lower = 0; + char name[30]; + + homa_mout.length = 0; +#define M(...) homa_metric_append(__VA_ARGS__) + M("time_cycles", homa_clock(), + "homa_clock() time when metrics were gathered\n"); + M("cpu_khz", homa_clock_khz(), + "Clock rate in khz\n"); + for (core = 0; core < nr_cpu_ids; core++) { + struct homa_metrics *m = &per_cpu(homa_metrics, core); + s64 delta; + + M("core", core, + "Core id for following metrics\n"); + for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) { + snprintf(name, sizeof(name), "msg_bytes_%d", + (i + 1) * 64); + M(name, m->small_msg_bytes[i], + "Bytes in incoming messages containing %d-%d bytes\n", + lower, (i + 1) * 64); + lower = (i + 1) * 64 + 1; + } + for (i = (HOMA_NUM_SMALL_COUNTS * 64) / 1024; + i < HOMA_NUM_MEDIUM_COUNTS; i++) { + snprintf(name, sizeof(name), "msg_bytes_%d", + (i + 1) * 1024); + M(name, m->medium_msg_bytes[i], + "Bytes in incoming messages containing %d-%d bytes\n", + lower, (i + 1) * 1024); + lower = (i + 1) * 1024 + 1; + } + M("large_msg_count", m->large_msg_count, + "# of incoming messages >= %d bytes\n", lower); + M("large_msg_bytes", m->large_msg_bytes, + "Bytes in incoming messages >= %d bytes\n", lower); + M("client_requests_started", m->client_requests_started, + "Client RPCs initiated\n"); + M("client_request_bytes_started", + m->client_request_bytes_started, + "Request bytes in all initiated client RPCs\n"); + M("client_request_bytes_done", m->client_request_bytes_done, + "Transmitted request bytes in all client RPCs\n"); + M("client_requests_done", m->client_requests_done, + "Client RPC requests fully transmitted\n"); + + M("client_responses_started", m->client_responses_started, + "Client RPCs for which at least one response pkt recvd\n"); + M("client_response_bytes_started", + m->client_response_bytes_started, + "Response bytes in all RPCS in client_responses_started\n"); + M("client_response_bytes_done", m->client_response_bytes_done, + "Response bytes received for all client RPCs\n"); + M("client_responses_done", m->client_responses_done, + "Client RPC responses fully received\n"); + M("server_requests_started", m->server_requests_started, + "Server RPCs for which at least one request pkt rcvd\n"); + M("server_request_bytes_started", + m->server_request_bytes_started, + "Request bytes in all RPCS in server_requests_started\n"); + M("server_request_bytes_done", m->server_request_bytes_done, + "Request bytes received for all server RPCs\n"); + M("server_requests_done", m->server_requests_done, + "Server RPC requests fully received\n"); + M("server_responses_started", m->server_responses_started, + "Server RPCs for which response was initiated\n"); + M("server_response_bytes_started", + m->server_response_bytes_started, + "Message bytes in all initiated server responses\n"); + M("server_response_bytes_done", m->server_response_bytes_done, + "Transmitted response bytes in all server RPCs\n"); + M("server_responses_done", m->server_responses_done, + "Server RPC responses fully transmitted\n"); + M("sent_msg_bytes", m->sent_msg_bytes, + "Total bytes in all outgoing messages\n"); + for (i = DATA; i <= MAX_OP; i++) { + char *symbol = homa_symbol_for_type(i); + + snprintf(name, sizeof(name), "packets_sent_%s", symbol); + M(name, m->packets_sent[i - DATA], + "%s packets sent\n", symbol); + } + for (i = DATA; i <= MAX_OP; i++) { + char *symbol = homa_symbol_for_type(i); + + snprintf(name, sizeof(name), "packets_rcvd_%s", symbol); + M(name, m->packets_received[i - DATA], + "%s packets 
received\n", symbol); + } + for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { + snprintf(name, sizeof(name), "priority%d_bytes", i); + M(name, m->priority_bytes[i], + "Bytes sent at priority %d (including headers)\n", i); + } + for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { + snprintf(name, sizeof(name), "priority%d_packets", i); + M(name, m->priority_packets[i], + "Packets sent at priority %d\n", i); + } + M("skb_allocs", m->skb_allocs, "sk_buffs allocated\n"); + M("skb_alloc_cycles", m->skb_alloc_cycles, + "Time spent allocating sk_buffs\n"); + M("skb_frees", m->skb_frees, + "Data sk_buffs freed in normal paths\n"); + M("skb_free_cycles", m->skb_free_cycles, + "Time spent freeing data sk_buffs\n"); + M("skb_page_allocs", m->skb_page_allocs, + "Pages allocated for sk_buff frags\n"); + M("skb_page_alloc_cycles", m->skb_page_alloc_cycles, + "Time spent allocating pages for sk_buff frags\n"); + M("requests_received", m->requests_received, + "Incoming request messages\n"); + M("responses_received", m->responses_received, + "Incoming response messages\n"); + M("wait_none", m->wait_none, + "Messages received without blocking or polling\n"); + M("wait_fast", m->wait_fast, + "Messages received while polling\n"); + M("wait_block", m->wait_block, + "Messages received after thread went to sleep\n"); + M("handoffs_thread_waiting", m->handoffs_thread_waiting, + "RPC handoffs to waiting threads (vs. queue)\n"); + M("handoffs_alt_thread", m->handoffs_alt_thread, + "RPC handoffs not to first on list (avoid busy core)\n"); + M("poll_cycles", m->poll_cycles, + "Time spent polling for incoming messages\n"); + M("softirq_calls", m->softirq_calls, + "Calls to homa_softirq (i.e. # GRO pkts received)\n"); + M("softirq_cycles", m->softirq_cycles, + "Time spent in homa_softirq during SoftIRQ\n"); + M("bypass_softirq_cycles", m->bypass_softirq_cycles, + "Time spent in homa_softirq during bypass from GRO\n"); + + /* Adjust stats gathered in Linux that use rdtsc. */ + M("linux_softirq_cycles", m->linux_softirq_cycles * + (homa_clock_khz() / 1000) / (tsc_khz / 1000), + "Time spent in all Linux SoftIRQ\n"); + M("napi_cycles", m->napi_cycles * (homa_clock_khz() / 1000) / + (tsc_khz / 1000), + "Time spent in NAPI-level packet handling\n"); + M("linux_softirqd_actions", m->linux_softirqd_actions, + "SoftIRQ actions taken in the background softirqd thread\n"); + M("send_cycles", m->send_cycles, + "Time spent in homa_sendmsg for requests\n"); + M("send_calls", m->send_calls, + "Total invocations of homa_sendmsg for requests\n"); + // It is possible for us to get here at a time when a + // thread has been blocked for a long time and has + // recorded blocked_cycles, but hasn't finished the + // system call so recv_cycles hasn't been incremented + // yet. If that happens, just record 0 to prevent + // underflow errors. 
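+ // For example, a thread that has just awakened from a long sleep
+ // has already added its sleep time to blocked_cycles, but it won't
+ // add to recv_cycles until its recvmsg call returns; until then the
+ // raw difference can be negative.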
+ delta = m->recv_cycles - m->blocked_cycles;
+ if (delta < 0)
+ delta = 0;
+ M("recv_cycles", delta,
+ "Unblocked time spent in recvmsg kernel call\n");
+ M("recv_calls", m->recv_calls,
+ "Total invocations of recvmsg kernel call\n");
+ M("blocked_cycles", m->blocked_cycles,
+ "Time spent blocked in homa_recvmsg\n");
+ M("reply_cycles", m->reply_cycles,
+ "Time spent in homa_sendmsg for responses\n");
+ M("reply_calls", m->reply_calls,
+ "Total invocations of homa_sendmsg for responses\n");
+ M("abort_cycles", m->abort_cycles,
+ "Time spent in homa_ioc_abort kernel call\n");
+ M("abort_calls", m->abort_calls,
+ "Total invocations of abort kernel call\n");
+ M("so_set_buf_cycles", m->so_set_buf_cycles,
+ "Time spent in setsockopt SO_HOMA_RCVBUF\n");
+ M("so_set_buf_calls", m->so_set_buf_calls,
+ "Total invocations of setsockopt SO_HOMA_RCVBUF\n");
+ M("grant_lock_cycles", m->grant_lock_cycles,
+ "Time spent with grant lock locked\n");
+ M("timer_cycles", m->timer_cycles,
+ "Time spent in homa_timer\n");
+ M("timer_reap_cycles", m->timer_reap_cycles,
+ "Time in homa_timer spent reaping RPCs\n");
+ M("data_pkt_reap_cycles", m->data_pkt_reap_cycles,
+ "Time in homa_data_pkt spent reaping RPCs\n");
+ M("idle_time_conflicts", m->idle_time_conflicts,
+ "Cache conflicts when updating link_idle_time\n");
+ M("nic_backlog_cycles", m->nic_backlog_cycles,
+ "Time when NIC queue was backlogged\n");
+ M("pacer_cycles", m->pacer_cycles,
+ "Execution time in pacer thread\n");
+ M("pacer_xmit_cycles", m->pacer_xmit_cycles,
+ "Time pacer spent xmitting packets (vs. polling NIC queue)\n");
+ M("pacer_homa_packets", m->pacer_homa_packets,
+ "Homa packets transmitted by the pacer\n");
+ M("pacer_homa_bytes", m->pacer_homa_bytes,
+ "Homa bytes transmitted by the pacer (including headers)\n");
+ M("pacer_fifo_bytes", m->pacer_fifo_bytes,
+ "Homa bytes transmitted using FIFO priority (including headers)\n");
+ M("pacer_tcp_packets", m->pacer_tcp_packets,
+ "TCP packets transmitted by the pacer\n");
+ M("pacer_tcp_bytes", m->pacer_tcp_bytes,
+ "TCP bytes transmitted by the pacer (including headers)\n");
+ M("pacer_help_bytes", m->pacer_help_bytes,
+ "Bytes transmitted via homa_qdisc_pacer_check\n");
+ M("qdisc_tcp_packets", m->qdisc_tcp_packets,
+ "TCP packets processed by homa_qdisc\n");
+ M("homa_cycles",
+ m->softirq_cycles + m->napi_cycles +
+ m->send_cycles + m->recv_cycles +
+ m->reply_cycles - m->blocked_cycles +
+ m->timer_cycles + m->nic_backlog_cycles,
+ "Total time in all Homa-related functions\n");
+ M("resent_packets", m->resent_packets,
+ "DATA packets sent in response to RESENDs\n");
+ M("peer_allocs", m->peer_allocs,
+ "New entries created in peer table\n");
+ M("peer_kmalloc_errors", m->peer_kmalloc_errors,
+ "kmalloc failures creating peer table entries\n");
+ M("peer_route_errors", m->peer_route_errors,
+ "Routing failures creating peer table entries\n");
+ M("peer_dst_refreshes", m->peer_dst_refreshes,
+ "Obsolete dsts had to be regenerated\n");
+ M("control_xmit_errors", m->control_xmit_errors,
+ "Errors sending control packets\n");
+ M("data_xmit_errors", m->data_xmit_errors,
+ "Errors sending data packets\n");
+ M("unknown_rpcs", m->unknown_rpcs,
+ "Non-grant packets discarded because RPC unknown\n");
+ M("server_cant_create_rpcs", m->server_cant_create_rpcs,
+ "Packets discarded because server couldn't create RPC\n");
+ M("unknown_packet_types", m->unknown_packet_types,
+ "Packets discarded because of unsupported type\n");
+ M("short_packets", m->short_packets,
+ "Packets discarded because
too short\n"); + M("packet_discards", m->packet_discards, + "Non-resent packets discarded because data already received\n"); + M("resent_discards", m->resent_discards, + "Resent packets discarded because data already received\n"); + M("resent_packets_used", m->resent_packets_used, + "Retransmitted packets that were actually used\n"); + M("rpc_timeouts", m->rpc_timeouts, + " RPCs aborted because peer was nonresponsive\n"); + M("server_rpc_discards", m->server_rpc_discards, + "RPCs discarded by server because of errors\n"); + M("server_rpcs_unknown", m->server_rpcs_unknown, + "RPCs aborted by server because unknown to client\n"); + M("client_lock_misses", m->client_lock_misses, + "Bucket lock misses for client RPCs\n"); + M("client_lock_miss_cycles", m->client_lock_miss_cycles, + "Time lost waiting for client bucket locks\n"); + M("server_lock_misses", m->server_lock_misses, + "Bucket lock misses for server RPCs\n"); + M("server_lock_miss_cycles", m->server_lock_miss_cycles, + "Time lost waiting for server bucket locks\n"); + M("socket_lock_misses", m->socket_lock_misses, + "Socket lock misses\n"); + M("socket_lock_miss_cycles", m->socket_lock_miss_cycles, + "Time lost waiting for socket locks\n"); + M("throttle_lock_misses", m->throttle_lock_misses, + "Throttle lock misses\n"); + M("throttle_lock_miss_cycles", m->throttle_lock_miss_cycles, + "Time lost waiting for throttle locks\n"); + M("peer_ack_lock_misses", m->peer_ack_lock_misses, + "Misses on peer ack locks\n"); + M("peer_ack_lock_miss_cycles", m->peer_ack_lock_miss_cycles, + "Time lost waiting for peer ack locks\n"); + M("grant_lock_misses", m->grant_lock_misses, + "Grant lock misses\n"); + M("grant_lock_miss_cycles", m->grant_lock_miss_cycles, + "Time lost waiting for grant lock\n"); + M("grantable_rpcs_integral", m->grantable_rpcs_integral, + "Integral of homa->num_grantable_rpcs*dt\n"); + M("grant_check_calls", m->grant_check_calls, + "Number of calls to homa_grant_check_rpc\n"); + M("grant_check_locked", m->grant_check_locked, + "Number of calls to homa_grant_check_rpc that acquired grant lock\n"); + M("grant_check_others", m->grant_check_others, + "Number of times homa_grant_check_rpc checked non-caller RPCs for grants\n"); + M("grant_check_recalcs", m->grant_check_recalcs, + "Number of times homa_grant_check_rpc updated grant priority order\n"); + M("grant_priority_bumps", m->grant_priority_bumps, + "Number of times an RPC moved up in the grant priority order\n"); + M("fifo_grant_bytes", m->fifo_grant_bytes, + "Bytes of grants issued using the FIFO mechanism\n"); + M("disabled_reaps", m->disabled_reaps, + "Reaper invocations that were disabled\n"); + M("deferred_rpc_reaps", m->deferred_rpc_reaps, + "RPCs skipped by reaper because still in use\n"); + M("reaper_calls", m->reaper_calls, + "Reaper invocations that were not disabled\n"); + M("reaper_dead_skbs", m->reaper_dead_skbs, + "Sum of hsk->dead_skbs across all reaper calls\n"); + M("reaper_active_skbs", m->reaper_active_skbs, + "RPCs skipped by reaper because of active tx skbs\n"); + M("throttle_list_adds", m->throttle_list_adds, + "Calls to homa_add_to_throttled\n"); + M("throttle_list_checks", m->throttle_list_checks, + "List elements checked in homa_add_to_throttled\n"); + M("ack_overflows", m->ack_overflows, + "Explicit ACKs sent because peer->acks was full\n"); + M("ignored_need_acks", m->ignored_need_acks, + "NEED_ACKs ignored because RPC result not yet received\n"); + M("bpage_reuses", m->bpage_reuses, + "Buffer page could be reused because ref count was zero\n"); + 
M("buffer_alloc_failures", m->buffer_alloc_failures,
+ "homa_pool_alloc_msg didn't find enough buffer space for an RPC\n");
+ M("linux_pkt_alloc_bytes", m->linux_pkt_alloc_bytes,
+ "Bytes allocated for rx packets by NIC driver due to cache overflows\n");
+ M("dropped_data_no_bufs", m->dropped_data_no_bufs,
+ "Data bytes dropped because app buffers full\n");
+ M("gen3_handoffs", m->gen3_handoffs,
+ "GRO->SoftIRQ handoffs made by Gen3 balancer\n");
+ M("gen3_alt_handoffs", m->gen3_alt_handoffs,
+ "Gen3 handoffs to secondary core (primary was busy)\n");
+ M("gro_grant_bypasses", m->gro_grant_bypasses,
+ "Grant packets passed directly to homa_softirq by homa_gro_receive\n");
+ M("gro_data_bypasses", m->gro_data_bypasses,
+ "Data packets passed directly to homa_softirq by homa_gro_receive\n");
+ for (i = 0; i < NUM_TEMP_METRICS; i++) {
+ snprintf(name, sizeof(name), "temp%d", i);
+ M(name, m->temp[i], "Temporary use in testing\n");
+ }
+ }
+
+ return homa_mout.output;
+}
+
+/**
+ * homa_metrics_open() - This function is invoked when /proc/net/homa_metrics is
+ * opened.
+ * @inode: The inode corresponding to the file.
+ * @file: Information about the open file.
+ *
+ * Return: always 0.
+ */
+int homa_metrics_open(struct inode *inode, struct file *file)
+{
+ /* Collect all of the metrics when the file is opened, and save
+ * these for use by subsequent reads (don't want the metrics to
+ * change between reads). If there are concurrent opens on the
+ * file, only read the metrics once, during the first open, and
+ * use this copy for subsequent opens, until the file has been
+ * completely closed.
+ */
+ mutex_lock(&homa_mout.mutex);
+ if (homa_mout.active_opens == 0)
+ homa_metrics_print();
+ homa_mout.active_opens++;
+ mutex_unlock(&homa_mout.mutex);
+ return 0;
+}
+
+/**
+ * homa_metrics_read() - This function is invoked to handle read kernel calls on
+ * /proc/net/homa_metrics.
+ * @file: Information about the file being read.
+ * @buffer: Address in user space of the buffer in which data from the file
+ * should be returned.
+ * @length: Number of bytes available at @buffer.
+ * @offset: Current read offset within the file.
+ *
+ * Return: the number of bytes returned at @buffer. 0 means the end of the
+ * file was reached, and a negative number indicates an error (-errno).
+ */
+ssize_t homa_metrics_read(struct file *file, char __user *buffer,
+ size_t length, loff_t *offset)
+{
+ size_t copied;
+
+ if (*offset >= homa_mout.length)
+ return 0;
+ copied = homa_mout.length - *offset;
+ if (copied > length)
+ copied = length;
+ if (copy_to_user(buffer, homa_mout.output + *offset, copied))
+ return -EFAULT;
+ *offset += copied;
+ return copied;
+}
+
+/**
+ * homa_metrics_lseek() - This function is invoked to handle seeks on
+ * /proc/net/homa_metrics. Right now seeks are ignored: the file must be
+ * read sequentially.
+ * @file: Information about the file being read.
+ * @offset: Distance to seek, in bytes
+ * @whence: Starting point from which to measure the distance to seek.
+ * Return: current position within file.
+ */
+loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence)
+{
+ return 0;
+}
+
+/**
+ * homa_metrics_release() - This function is invoked when the last reference to
+ * an open /proc/net/homa_metrics is closed. It performs cleanup.
+ * @inode: The inode corresponding to the file.
+ * @file: Information about the open file.
+ *
+ * Return: always 0.
+ */
+int homa_metrics_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&homa_mout.mutex);
+ homa_mout.active_opens--;
+ mutex_unlock(&homa_mout.mutex);
+ return 0;
+}
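Since homa_metrics_open() above snapshots all of the per-core counters once per open, a user-level reader always sees one internally consistent dump. A minimal user-space reader might look like the following sketch (not part of the patch; it assumes only the /proc/net/homa_metrics path used above):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/homa_metrics", "r");

	if (!f) {
		perror("/proc/net/homa_metrics");
		return 1;
	}

	/* Each line is "name value description"; just echo them all. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}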
diff --git a/homa_metrics.h b/homa_metrics.h
new file mode 100644
index 00000000..ed73dba5
--- /dev/null
+++ b/homa_metrics.h
@@ -0,0 +1,865 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file contains declarations related to Homa's performance metrics. */
+
+#ifndef _HOMA_METRICS_H
+#define _HOMA_METRICS_H
+
+#include <linux/percpu.h>
+#include <linux/types.h>
+
+#include "homa_wire.h"
+
+/**
+ * struct homa_metrics - various performance counters kept by Homa.
+ *
+ * There is one of these structures for each core, so counters can
+ * be updated without worrying about synchronization or extra cache
+ * misses.
+ *
+ * All counters are free-running: they never reset.
+ */
+struct homa_metrics {
+ /**
+ * @small_msg_bytes: entry i holds the total number of bytes
+ * received in messages whose length is between 64*i and 64*i + 63,
+ * inclusive.
+ */
+#define HOMA_NUM_SMALL_COUNTS 64
+ u64 small_msg_bytes[HOMA_NUM_SMALL_COUNTS];
+
+ /**
+ * @medium_msg_bytes: entry i holds the total number of bytes
+ * received in messages whose length is between 1024*i and
+ * 1024*i + 1023, inclusive. The first four entries are always 0
+ * (small_msg_bytes covers this range).
+ */
+#define HOMA_NUM_MEDIUM_COUNTS 128
+ u64 medium_msg_bytes[HOMA_NUM_MEDIUM_COUNTS];
+
+ /**
+ * @large_msg_count: the total number of messages received whose
+ * length is too large to appear in medium_msg_bytes.
+ */
+ u64 large_msg_count;
+
+ /**
+ * @large_msg_bytes: the total number of bytes received in
+ * messages too large to be counted by medium_msg_bytes.
+ */
+ u64 large_msg_bytes;
+
+ /**
+ * @client_requests_started: cumulative count of all client RPCs
+ * that have been initiated on this node.
+ */
+ u64 client_requests_started;
+
+ /**
+ * @client_request_bytes_started: total number of bytes in the
+ * request messages for all client RPCs that have been initiated on
+ * this node.
+ */
+ u64 client_request_bytes_started;
+
+ /**
+ * @client_request_bytes_done: total number of bytes in request
+ * messages that no longer need to be transmitted (for the first time)
+ * either because they were transmitted or because the RPC was aborted.
+ * Always <= client_request_bytes_started.
+ */
+ u64 client_request_bytes_done;
+
+ /**
+ * @client_requests_done: cumulative count of all client RPCs
+ * whose request messages have been completely transmitted (or the RPC
+ * was aborted).
+ */
+ u64 client_requests_done;
+
+ /**
+ * @client_responses_started: cumulative count of all client RPCs
+ * for which at least one packet of the response has been received.
+ */
+ u64 client_responses_started;
+
+ /**
+ * @client_response_bytes_started: total number of bytes in
+ * response messages for client RPCs for which at least one byte
+ * of response has been received.
+ */
+ u64 client_response_bytes_started;
+
+ /**
+ * @client_response_bytes_done: cumulative count of bytes in
+ * @client_response_bytes_started that no longer need to be received
+ * (either they were received or the RPC was aborted).
+ */
+ u64 client_response_bytes_done;
+
+ /**
+ * @client_responses_done: cumulative count of all client RPCs
+ * that have been completed on this node (either successfully or
+ * with errors).
+ */
+ u64 client_responses_done;
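+
+ /* Because every counter in this structure is monotonic, derived
+ * values can be computed by subtracting one counter from another;
+ * e.g., the number of client RPCs with responses currently in
+ * flight is approximately client_responses_started -
+ * client_responses_done (approximate because the counters are
+ * per-core and updated without synchronization).
+ */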
+
+ /**
+ * @server_requests_started: cumulative count of all server RPCs
+ * for which at least one packet of the request has been received.
+ */
+ u64 server_requests_started;
+
+ /**
+ * @server_request_bytes_started: total number of bytes in the
+ * request messages for server RPCs counted by @server_requests_started.
+ */
+ u64 server_request_bytes_started;
+
+ /**
+ * @server_request_bytes_done: total number of bytes in
+ * @server_request_bytes_started that no longer need to be received
+ * (either they were received or the RPC was aborted).
+ */
+ u64 server_request_bytes_done;
+
+ /**
+ * @server_requests_done: cumulative count of all server RPCs
+ * whose request messages have been completely received (or the RPC
+ * was aborted).
+ */
+ u64 server_requests_done;
+
+ /**
+ * @server_responses_started: cumulative count of all server RPCs
+ * for which transmission of the response has begun.
+ */
+ u64 server_responses_started;
+
+ /**
+ * @server_response_bytes_started: total number of bytes in
+ * the messages counted by @server_responses_started.
+ */
+ u64 server_response_bytes_started;
+
+ /**
+ * @server_response_bytes_done: total number of bytes in
+ * @server_response_bytes_started that no longer need to be transmitted
+ * (either they were transmitted at least once or the RPC was aborted).
+ */
+ u64 server_response_bytes_done;
+
+ /**
+ * @server_responses_done: total number of server RPCs in
+ * @server_requests_started that are no longer active (either the
+ * response was completely sent or the RPC was aborted).
+ */
+ u64 server_responses_done;
+
+ /**
+ * @sent_msg_bytes: The total number of bytes in outbound
+ * messages.
+ */
+ u64 sent_msg_bytes;
+
+ /**
+ * @packets_sent: total number of packets sent for each packet type
+ * (entry 0 corresponds to DATA, and so on).
+ */
+ u64 packets_sent[MAX_OP + 1 - DATA];
+
+ /**
+ * @packets_received: total number of packets received for each
+ * packet type (entry 0 corresponds to DATA, and so on).
+ */
+ u64 packets_received[MAX_OP + 1 - DATA];
+
+ /** @priority_bytes: total bytes sent at each priority level. */
+ u64 priority_bytes[HOMA_MAX_PRIORITIES];
+
+ /** @priority_packets: total packets sent at each priority level. */
+ u64 priority_packets[HOMA_MAX_PRIORITIES];
+
+ /**
+ * @skb_allocs: total number of calls to homa_skb_alloc_tx.
+ */
+ u64 skb_allocs;
+
+ /** @skb_alloc_cycles: total time spent in homa_skb_alloc_tx. */
+ u64 skb_alloc_cycles;
+
+ /**
+ * @skb_frees: total number of sk_buffs for data packets that have
+ * been freed (counts normal paths only).
+ */
+ u64 skb_frees;
+
+ /** @skb_free_cycles: total time spent freeing sk_buffs. */
+ u64 skb_free_cycles;
+
+ /**
+ * @skb_page_allocs: total number of calls to homa_skb_page_alloc.
+ */
+ u64 skb_page_allocs;
+
+ /** @skb_page_alloc_cycles: total time spent in homa_skb_page_alloc. */
+ u64 skb_page_alloc_cycles;
+
+ /**
+ * @requests_received: total number of request messages received.
+ */
+ u64 requests_received;
+
+ /**
+ * @responses_received: total number of response messages received.
+ */
+ u64 responses_received;
+
+ /**
+ * @wait_none: total number of times that an incoming message was
+ * already waiting when recvmsg was invoked.
+ */
+ u64 wait_none;
+
+ /**
+ * @wait_fast: total number of times that a thread received an
+ * incoming message while polling (i.e. the message wasn't
+ * immediately available, but the thread never blocked).
+ */ + u64 wait_fast; + + /** + * @wait_block: total number of times that a thread received an + * incoming message after blocking at least once. + */ + u64 wait_block; + + /** + * @handoffs_thread_waiting: total number of times that an RPC + * was handed off to a waiting thread (vs. being queued). + */ + u64 handoffs_thread_waiting; + + /** + * @handoffs_alt_thread: total number of times that a thread other + * than the first on the list was chosen for a handoff (because the + * first thread was on a busy core). + */ + u64 handoffs_alt_thread; + + /** + * @poll_cycles: total time spent in the polling loop in + * homa_wait_for_message. + */ + u64 poll_cycles; + + /** + * @softirq_calls: total number of calls to homa_softirq (i.e., + * total number of GRO packets processed, each of which could contain + * multiple Homa packets). + */ + u64 softirq_calls; + + /** + * @softirq_cycles: total time spent executing homa_softirq when + * invoked under Linux's SoftIRQ handler. + */ + u64 softirq_cycles; + + /** + * @bypass_softirq_cycles: total time spent executing homa_softirq when + * invoked during GRO, bypassing the SoftIRQ mechanism. + */ + u64 bypass_softirq_cycles; + + /** + * @linux_softirq_cycles: total time spent executing all softirq + * activities, as measured by the Linux softirq module. Only + * available with modified Linux kernels. + */ + u64 linux_softirq_cycles; + + /** + * @napi_cycles: total time spent executing all NAPI activities, as + * measured by the Linux softirq module. Only available with modified + * Linux kernels. + */ + u64 napi_cycles; + + /** + * @linux_softirqd_actions: total number of times that a SoftIRQ + * action was taken in the softirqd daemon thread (slow path) rather + * than in the bottom-half SoftIRQ handler. + */ + u64 linux_softirqd_actions; + + /** + * @send_cycles: total time spent executing the homa_sendmsg kernel + * call handler to send requests. + */ + u64 send_cycles; + + /** + * @send_calls: total number of invocations of homa_sendmsg + * for requests. + */ + u64 send_calls; + + /** + * @recv_cycles: total time spent executing homa_recvmsg (including + * time when the thread is blocked). + */ + u64 recv_cycles; + + /** @recv_calls: total number of invocations of homa_recvmsg. */ + u64 recv_calls; + + /** + * @blocked_cycles: total time spent by threads in blocked state + * while executing the homa_recvmsg kernel call handler. + */ + u64 blocked_cycles; + + /** + * @reply_cycles: total time spent executing the homa_sendmsg kernel + * call handler to send responses. + */ + u64 reply_cycles; + + /** + * @reply_calls: total number of invocations of homa_sendmsg + * for responses. + */ + u64 reply_calls; + + /** + * @abort_cycles: total time spent executing the homa_ioc_abort + * kernel call handler. + */ + u64 abort_cycles; + + /** + * @abort_calls: total number of invocations of the homa_ioc_abort + * kernel call. + */ + u64 abort_calls; + + /** + * @so_set_buf_cycles: total time spent executing the homa_ioc_set_buf + * kernel call handler. + */ + u64 so_set_buf_cycles; + + /** + * @so_set_buf_calls: total number of invocations of the homa_ioc_set_buf + * kernel call. + */ + u64 so_set_buf_calls; + + /** @grant_lock_cycles: total time spent with the grant lock locked. */ + u64 grant_lock_cycles; + + /** @timer_cycles: total time spent in homa_timer. */ + u64 timer_cycles; + + /** + * @timer_reap_cycles: total time spent by homa_timer to reap dead + * RPCs. This time is included in @timer_cycles.
+ */ + u64 timer_reap_cycles; + + /** + * @data_pkt_reap_cycles: total time spent by homa_data_pkt to reap + * dead RPCs. + */ + u64 data_pkt_reap_cycles; + + /** + * @idle_time_conflicts: total number of times that an update to + * link_idle_time in homa_qdisc_update_link_idle failed because + * of a conflicting access. + */ + u64 idle_time_conflicts; + + /** + * @nic_backlog_cycles: total amount of time when there were packets + * waiting to be transmitted in homa_qdisc because the NIC queue was + * too long. + */ + u64 nic_backlog_cycles; + + /** + * @pacer_cycles: total execution time in the pacer thread (excluding + * blocked time). + */ + u64 pacer_cycles; + + /** + * @pacer_xmit_cycles: total time spent by the pacer actually + * transmitting packets (as opposed to polling waiting for the + * NIC queue to subside). + */ + u64 pacer_xmit_cycles; + + /** + * @pacer_homa_packets: total number of Homa packets that were + * transmitted by homa_qdisc_pacer (they were deferred because of + * NIC queue overload). + */ + u64 pacer_homa_packets; + + /** + * @pacer_homa_bytes: total number of bytes in Homa packets that were + * transmitted by homa_qdisc_pacer (they were deferred because of + * NIC queue overload). + */ + u64 pacer_homa_bytes; + + /** + * @pacer_fifo_bytes: total number of bytes in Homa packets that + * were transmitted using FIFO priority rather than SRPT. + */ + u64 pacer_fifo_bytes; + + /** + * @pacer_tcp_packets: total number of TCP packets that were + * transmitted by homa_qdisc_pacer (they were deferred because of + * NIC queue overload). + */ + u64 pacer_tcp_packets; + + /** + * @pacer_tcp_bytes: total number of bytes in TCP packets that were + * transmitted by homa_qdisc_pacer (they were deferred because of + * NIC queue overload). + */ + u64 pacer_tcp_bytes; + + /** + * @pacer_help_bytes: bytes that the pacer transmitted via calls to + * homa_qdisc_pacer_check (presumably because the pacer thread + * wasn't keeping up). Includes both TCP and Homa packets as well as + * header bytes. + */ + u64 pacer_help_bytes; + + /** + * @qdisc_tcp_packets: total number of TCP packets that passed through + * homa_qdisc; includes packets that were transmitted immediately as + * well as those that were deferred. + */ + u64 qdisc_tcp_packets; + + /** + * @resent_packets: total number of data packets issued in response to + * RESEND packets. + */ + u64 resent_packets; + + /** + * @peer_allocs: total # of new entries created in Homa's + * peer table (this value doesn't increment if the desired peer is + * found in the entry in its hash chain). + */ + u64 peer_allocs; + + /** + * @peer_kmalloc_errors: total number of times homa_peer_get + * returned an error because it couldn't allocate memory for a new + * peer. + */ + u64 peer_kmalloc_errors; + + /** + * @peer_route_errors: total number of times homa_peer_get + * returned an error because it couldn't create a route to the peer. + */ + u64 peer_route_errors; + + /** + * @peer_dst_refreshes: total number of times that the dst for a + * peer had to be regenerated because the existing one had become + * obsolete. + */ + u64 peer_dst_refreshes; + + /** + * @control_xmit_errors: total number of times ip_queue_xmit + * failed when transmitting a control packet. + */ + u64 control_xmit_errors; + + /** + * @data_xmit_errors: total number of times ip_queue_xmit + * failed when transmitting a data packet.
+ */ + u64 data_xmit_errors; + + /** + * @unknown_rpcs: total number of times an incoming packet was + * discarded because it referred to a nonexistent RPC. Doesn't + * count grant packets received by servers (since these are + * fairly common). + */ + u64 unknown_rpcs; + + /** + * @server_cant_create_rpcs: total number of times a server discarded + * an incoming packet because it couldn't create a homa_rpc object. + */ + u64 server_cant_create_rpcs; + + /** + * @unknown_packet_types: total number of times a packet was discarded + * because its type wasn't one of the supported values. + */ + u64 unknown_packet_types; + + /** + * @short_packets: total number of times a packet was discarded + * because it was too short to hold all the required information. + */ + u64 short_packets; + + /** + * @packet_discards: total number of times a normal (non-retransmitted) + * packet was discarded because all its data had already been received. + */ + u64 packet_discards; + + /** + * @resent_discards: total number of times a retransmitted packet + * was discarded because its data had already been received. + */ + u64 resent_discards; + + /** + * @resent_packets_used: total number of times a resent packet was + * actually incorporated into the message at the target (i.e. it + * wasn't redundant). + */ + u64 resent_packets_used; + + /** + * @rpc_timeouts: total number of times an RPC (either client or + * server) was aborted because the peer was nonresponsive. + */ + u64 rpc_timeouts; + + /** + * @server_rpc_discards: total number of times an RPC was aborted on + * the server side because of a timeout. + */ + u64 server_rpc_discards; + + /** + * @server_rpcs_unknown: total number of times an RPC was aborted on + * the server side because it is no longer known to the client. + */ + u64 server_rpcs_unknown; + + /** + * @client_lock_misses: total number of times that Homa had to wait + * to acquire a client bucket lock. + */ + u64 client_lock_misses; + + /** + * @client_lock_miss_cycles: total time spent waiting for client + * bucket lock misses. + */ + u64 client_lock_miss_cycles; + + /** + * @server_lock_misses: total number of times that Homa had to wait + * to acquire a server bucket lock. + */ + u64 server_lock_misses; + + /** + * @server_lock_miss_cycles: total time spent waiting for server + * bucket lock misses. + */ + u64 server_lock_miss_cycles; + + /** + * @socket_lock_miss_cycles: total time spent waiting for socket + * lock misses. + */ + u64 socket_lock_miss_cycles; + + /** + * @socket_lock_misses: total number of times that Homa had to wait + * to acquire a socket lock. + */ + u64 socket_lock_misses; + + /** + * @throttle_lock_miss_cycles: total time spent waiting for throttle + * lock misses. + */ + u64 throttle_lock_miss_cycles; + + /** + * @throttle_lock_misses: total number of times that Homa had to wait + * to acquire the throttle lock. + */ + u64 throttle_lock_misses; + + /** + * @peer_ack_lock_miss_cycles: total time spent waiting for peer ack + * lock misses. + */ + u64 peer_ack_lock_miss_cycles; + + /** + * @peer_ack_lock_misses: total number of times that Homa had to wait + * to acquire the lock used for managing acks for a peer. + */ + u64 peer_ack_lock_misses; + + /** + * @grant_lock_miss_cycles: total time spent waiting for grant lock + * misses. + */ + u64 grant_lock_miss_cycles; + + /** + * @grant_lock_misses: total number of times that Homa had to wait + * to acquire the grant lock.
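+ *
+ * These miss metrics are typically maintained with a trylock-first
+ * pattern. As a hypothetical sketch (not code from this patch):
+ *
+ *	if (!spin_trylock_bh(&grant_lock)) {
+ *		u64 start = homa_clock();
+ *
+ *		spin_lock_bh(&grant_lock);
+ *		INC_METRIC(grant_lock_misses, 1);
+ *		INC_METRIC(grant_lock_miss_cycles, homa_clock() - start);
+ *	}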
+ */ + u64 grant_lock_misses; + + /** + * @grantable_rpcs_integral: cumulative sum of time_delta*grantable, + * where time_delta is in nanoseconds and grantable is the value of + * homa->num_grantable_rpcs over that time period. + */ + u64 grantable_rpcs_integral; + + /** + * @grant_check_calls: cumulative number of times homa_grant_check_rpc + * has been invoked. + */ + u64 grant_check_calls; + + /** + * @grant_check_locked: cumulative number of times an invocation of + * homa_grant_check_rpc acquired the grant lock at least once. + */ + u64 grant_check_locked; + + /** + * @grant_check_recalcs: cumulative number of times that + * homa_grant_check_rpc verified and/or adjusted the priority of + * active RPCs. + */ + u64 grant_check_recalcs; + + /** + * @grant_check_others: cumulative number of times homa_grant_check_rpc + * checked other RPCs besides the invoking one for potential grants. + */ + u64 grant_check_others; + + /** + * @grant_priority_bumps: cumulative number of times the grant priority + * of an RPC has increased above its next-higher-priority neighbor. + */ + u64 grant_priority_bumps; + + /** + * @fifo_grant_bytes: total number of bytes of grants issued via + * the FIFO granting mechanism. + */ + u64 fifo_grant_bytes; + + /** + * @disabled_reaps: total number of times that the reaper couldn't + * run at all because it was disabled. + */ + u64 disabled_reaps; + + /** + * @deferred_rpc_reaps: total number of times that the reaper skipped + * an RPC because it was still in use elsewhere. + */ + u64 deferred_rpc_reaps; + + /** + * @reaper_calls: total number of times that the reaper was invoked + * and was not disabled. + */ + u64 reaper_calls; + + /** + * @reaper_dead_skbs: incremented by hsk->dead_skbs each time that + * reaper_calls is incremented. + */ + u64 reaper_dead_skbs; + + /** + * @reaper_active_skbs: total number of times homa_rpc_reap had to skip + * an RPC because one of its tx skb's was still in the transmit + * pipeline. + */ + u64 reaper_active_skbs; + + /** + * @throttle_list_adds: total number of calls to homa_add_to_throttled. + */ + u64 throttle_list_adds; + + /** + * @throttle_list_checks: number of list elements examined in + * calls to homa_add_to_throttled. + */ + u64 throttle_list_checks; + + /** + * @ack_overflows: total number of times that homa_peer_add_ack + * found insufficient space for the new id and hence had to send an + * ACK message. + */ + u64 ack_overflows; + + /** + * @ignored_need_acks: total number of times that a NEED_ACK packet + * was ignored because the RPC's result hadn't been fully received. + */ + u64 ignored_need_acks; + + /** + * @bpage_reuses: total number of times that, when an owned page + * reached the end, it could be reused because all existing + * allocations had been released. + */ + u64 bpage_reuses; + + /** + * @buffer_alloc_failures: total number of times that + * homa_pool_alloc_msg was unable to allocate buffer space for + * an incoming message. + */ + u64 buffer_alloc_failures; + + /** + * @linux_pkt_alloc_bytes: total bytes allocated in new packet buffers + * by the NIC driver because of packet cache underflows. + */ + u64 linux_pkt_alloc_bytes; + + /** + * @dropped_data_no_bufs: total bytes of incoming data dropped because + * there was no application buffer space available. + */ + u64 dropped_data_no_bufs; + + /** + * @gen3_handoffs: total number of handoffs from GRO to SoftIRQ made + * by the Gen3 load balancer.
+ */ + u64 gen3_handoffs; + + /** + * @gen3_alt_handoffs: total number of GRO->SoftIRQ handoffs that + * didn't choose the primary SoftIRQ core because it was busy with + * app threads. + */ + u64 gen3_alt_handoffs; + + /** + * @gro_grant_bypasses: total number of GRANT packets passed directly + * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ + * mechanism (triggered by HOMA_GRO_FAST_GRANTS). + */ + u64 gro_grant_bypasses; + + /** + * @gro_data_bypasses: total number of DATA packets passed directly + * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ + * mechanism (triggered by HOMA_GRO_SHORT_BYPASS). + */ + u64 gro_data_bypasses; + + /** @temp: For temporary use during testing. */ +#define NUM_TEMP_METRICS 10 + u64 temp[NUM_TEMP_METRICS]; +}; + +DECLARE_PER_CPU(struct homa_metrics, homa_metrics); + +/** + * struct homa_metrics_output - Holds global information used to export metrics + * information through a file in /proc. + */ +struct homa_metrics_output { + /** + * @mutex: Used to synchronize accesses to @active_opens + * and updates to @output. + */ + struct mutex mutex; + + /** + * @output: a human-readable string containing recent values + * for all the Homa performance metrics, as generated by + * homa_metric_append. This string is kmalloc-ed; NULL means + * homa_metric_append has never been called. + */ + char *output; + + /** @capacity: number of bytes available at @output. */ + size_t capacity; + + /** + * @length: current length of the string in @output, not including + * terminating NULL character. + */ + size_t length; + + /** + * @active_opens: number of open struct files that currently exist + * for the metrics file in /proc. + */ + int active_opens; + + /** + * @dir_entry: Used to remove /proc/net/homa_metrics when the + * module is unloaded. + */ + struct proc_dir_entry *dir_entry; +}; + +/** + * homa_metrics_per_cpu() - Return the metrics structure for the current core. + * This is unsynchronized and doesn't guarantee non-preemption. + * Return: see above + */ +static inline struct homa_metrics *homa_metrics_per_cpu(void) +{ + return &per_cpu(homa_metrics, raw_smp_processor_id()); +} + +/* It isn't necessary to disable preemption here, because we don't need + * perfect synchronization: if the invoking thread is moved to a + * different core and races with an INC_METRIC there, the worst that + * happens is that one of the INC_METRICs is lost, which isn't a big deal.
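+ *
+ * For example (hypothetical caller), INC_METRIC(reaper_calls, 1) expands
+ * to an unlocked read-modify-write of the current core's counter:
+ *
+ *	per_cpu(homa_metrics, raw_smp_processor_id()).reaper_calls += 1;
+ *
+ * so a migration between reading the processor id and performing the
+ * add can at worst lose a single increment.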
+ */ +#define INC_METRIC(metric, count) (per_cpu(homa_metrics, \ + raw_smp_processor_id()).metric += (count)) + +extern struct homa_metrics_output homa_mout; + +void homa_metric_append(const char *name, u64 value, const char *format, ...); +void homa_metrics_end(void); +int homa_metrics_init(void); +loff_t homa_metrics_lseek(struct file *file, loff_t offset, + int whence); +int homa_metrics_open(struct inode *inode, struct file *file); +char *homa_metrics_print(void); +ssize_t homa_metrics_read(struct file *file, char __user *buffer, + size_t length, loff_t *offset); +int homa_metrics_release(struct inode *inode, struct file *file); +int homa_proc_read_metrics(char *buffer, char **start, off_t offset, + int count, int *eof, void *data); + +#endif /* _HOMA_METRICS_H */ diff --git a/homa_offload.c b/homa_offload.c index 15651b20..812b567c 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -1,23 +1,17 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file implements GSO (Generic Segmentation Offload) and GRO (Generic * Receive Offload) for Homa. */ #include "homa_impl.h" +#include "homa_offload.h" +#include "homa_pacer.h" +#include "homa_qdisc.h" + +DEFINE_PER_CPU(struct homa_offload_core, homa_offload_core); + +#define CORES_TO_CHECK 4 static const struct net_offload homa_offload = { .callbacks = { @@ -27,7 +21,20 @@ static const struct net_offload homa_offload = { }, }; -extern struct homa *homa; +#ifndef __STRIP__ /* See strip.py */ +/* Pointers to TCP's net_offload structures. NULL means homa_gro_hook_tcp + * hasn't been called yet. + */ +static const struct net_offload *tcp_net_offload; +static const struct net_offload *tcp6_net_offload; + +/* + * Identical to *tcp_net_offload except that the gro_receive function + * has been replaced. + */ +static struct net_offload hook_tcp_net_offload; +static struct net_offload hook_tcp6_net_offload; +#endif /* See strip.py */ /** * homa_offload_init() - Invoked to enable GRO and GSO. Typically invoked @@ -36,8 +43,28 @@ extern struct homa *homa; */ int homa_offload_init(void) { + int i; + + for (i = 0; i < nr_cpu_ids; i++) { + struct homa_offload_core *offload_core; + int j; + + offload_core = &per_cpu(homa_offload_core, i); + offload_core->last_active = 0; + offload_core->last_gro = 0; + atomic_set(&offload_core->softirq_backlog, 0); + offload_core->softirq_offset = 0; + offload_core->gen3_softirq_cores[0] = i ^ 1; + for (j = 1; j < NUM_GEN3_SOFTIRQ_CORES; j++) + offload_core->gen3_softirq_cores[j] = -1; + offload_core->last_app_active = 0; + offload_core->held_skb = NULL; + offload_core->held_bucket = 0; + } + int res1 = inet_add_offload(&homa_offload, IPPROTO_HOMA); int res2 = inet6_add_offload(&homa_offload, IPPROTO_HOMA); + return res1 ? 
res1 : res2; } @@ -49,11 +76,93 @@ int homa_offload_init(void) */ int homa_offload_end(void) { - int res1 = inet_del_offload(&homa_offload, IPPROTO_HOMA); int res2 = inet6_del_offload(&homa_offload, IPPROTO_HOMA); + int res1 = inet_del_offload(&homa_offload, IPPROTO_HOMA); + return res1 ? res1 : res2; } +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_gro_hook_tcp() - Arranges for TCP gro_receive calls to be + * mediated by this file, so that Homa-over-TCP packets can be retrieved + * and funneled through Homa. + */ +void homa_gro_hook_tcp(void) +{ + if (tcp_net_offload) + return; + + pr_notice("Homa setting up TCP hijacking\n"); + rcu_read_lock(); + tcp_net_offload = rcu_dereference(inet_offloads[IPPROTO_TCP]); + hook_tcp_net_offload = *tcp_net_offload; + hook_tcp_net_offload.callbacks.gro_receive = homa_tcp_gro_receive; + inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + &hook_tcp_net_offload; + + tcp6_net_offload = rcu_dereference(inet6_offloads[IPPROTO_TCP]); + hook_tcp6_net_offload = *tcp6_net_offload; + hook_tcp6_net_offload.callbacks.gro_receive = homa_tcp_gro_receive; + inet6_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + &hook_tcp6_net_offload; + rcu_read_unlock(); +} + +/** + * homa_gro_unhook_tcp() - Reverses the effects of a previous call to + * homa_gro_hook_tcp, so that TCP packets are now passed directly to + * TCP's gro_receive function without mediation. + */ +void homa_gro_unhook_tcp(void) +{ + if (!tcp_net_offload) + return; + pr_notice("Homa cancelling TCP hijacking\n"); + inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + tcp_net_offload; + tcp_net_offload = NULL; + inet6_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + tcp6_net_offload; + tcp6_net_offload = NULL; +} + +/** + * homa_tcp_gro_receive() - Invoked instead of TCP's normal gro_receive function + * when hooking is enabled. Identifies Homa-over-TCP packets and passes them + * to Homa; sends real TCP packets to TCP's gro_receive function. + * @held_list: Pointer to header for list of packets that are being + * held for possible GRO merging. + * @skb: The newly arrived packet. + */ +struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, + struct sk_buff *skb) +{ + struct homa_common_hdr *h = (struct homa_common_hdr *) + skb_transport_header(skb); + + // tt_record4("homa_tcp_gro_receive got type 0x%x, flags 0x%x, " + // "urgent 0x%x, id %d", h->type, h->flags, + // ntohs(h->urgent), homa_local_id(h->sender_id)); + if (h->flags != HOMA_TCP_FLAGS || + ntohs(h->urgent) != HOMA_TCP_URGENT) + return tcp_net_offload->callbacks.gro_receive(held_list, skb); + + /* Change the packet's IP protocol to Homa so that it will get + * dispatched directly to Homa in the future. + */ + if (skb_is_ipv6(skb)) { + ipv6_hdr(skb)->nexthdr = IPPROTO_HOMA; + } else { + ip_hdr(skb)->check = ~csum16_add(csum16_sub(~ip_hdr(skb)->check, + htons(ip_hdr(skb)->protocol)), + htons(IPPROTO_HOMA)); + ip_hdr(skb)->protocol = IPPROTO_HOMA; + } + return homa_gro_receive(held_list, skb); +} +#endif /* See strip.py */ + /** * homa_set_softirq_cpu() - Arrange for SoftIRQ processing of a packet to * occur on a specific core (creates a socket flow table entry for the core, @@ -62,22 +171,54 @@ int homa_offload_end(void) * @cpu: Index of core to which the packet should be directed for * SoftIRQ processing.
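+ *
+ * Sketch of the mechanism (example values, not from this patch): with
+ * rps_cpu_mask 0x3f, steering to core 5 computes hash = 5 + 0x3f + 1 =
+ * 0x45, stores 0x45 in the RPS sock flow table at index 0x45, and sets
+ * the skb's software hash to 0x45; RPS then extracts 0x45 & 0x3f = 5
+ * to pick the SoftIRQ core.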
*/ -static inline void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) +static void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) { struct rps_sock_flow_table *sock_flow_table; int hash; - sock_flow_table = rcu_dereference(rps_sock_flow_table); - if (sock_flow_table == NULL) - return; - hash = cpu + rps_cpu_mask + 1; - if (sock_flow_table->ents[hash] != hash) { - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - sock_flow_table->ents[hash] = hash; - rcu_read_unlock(); + rcu_read_lock(); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (sock_flow_table) { + hash = cpu + net_hotdata.rps_cpu_mask + 1; + if (sock_flow_table->ents[hash] != hash) { + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); + sock_flow_table->ents[hash] = hash; + } + __skb_set_sw_hash(skb, hash, false); + } + rcu_read_unlock(); +} + +/** + * homa_send_ipis() - If there are any interprocessor interrupts pending + * from this core to others (for packets queued for SoftIRQ processing) + * issue those interrupts now. This function is needed because calling + * netif_receive_skb doesn't actually issue IPIs; it queues them until + * all NAPI processing is finished, and this could be a long time if a + * lot more packets are available for processing. + */ +void homa_send_ipis(void) +{ +#if defined(CONFIG_RPS) && !defined(__UNIT_TEST__) + /* This function duplicates the code from net_rps_send_ipi because + * we can't call that function from here. + */ + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + struct softnet_data *remsd; + + local_irq_disable(); + remsd = sd->rps_ipi_list; + sd->rps_ipi_list = NULL; + local_irq_enable(); + + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + smp_call_function_single_async(remsd->cpu, &remsd->csd); + remsd = next; } - __skb_set_sw_hash(skb, hash, false); +#endif } /** @@ -88,17 +229,18 @@ static inline void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) * Return: A list of packets, or NULL if the packet couldn't be split. */ struct sk_buff *homa_gso_segment(struct sk_buff *skb, - netdev_features_t features) + netdev_features_t features) { struct sk_buff *segs; + tt_record2("homa_gso_segment invoked, frags %d, headlen %d", - skb_shinfo(skb)->nr_frags, skb_headlen(skb)); + skb_shinfo(skb)->nr_frags, skb_headlen(skb)); /* This is needed to separate header info (which is replicated * in each segment) from data, which is divided among the segments. */ - __skb_pull(skb, sizeof(struct data_header) - - sizeof(struct data_segment)); + __skb_pull(skb, sizeof(struct homa_data_hdr) + - sizeof(struct homa_seg_hdr)); segs = skb_segment(skb, features); /* Set incrementing ids in each of the segments (mimics behavior @@ -107,7 +249,8 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, if (ip_hdr(segs)->version == 4) { struct sk_buff *seg; int i = 0; - for (seg = segs; seg != NULL; seg = seg->next) { + + for (seg = segs; seg; seg = seg->next) { ip_hdr(seg)->id = htons(i); i++; } @@ -133,7 +276,7 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, * passed up the stack immediately. */ struct sk_buff *homa_gro_receive(struct list_head *held_list, - struct sk_buff *skb) + struct sk_buff *skb) { /* This function will do one of the following things: * 1.
Merge skb with a packet in gro_list by appending it to @@ -145,59 +288,77 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * gro_list by the caller, so it will be considered for merges * in the future. */ -// int hdr_offset, hdr_end; - struct sk_buff *held_skb; + struct homa *homa = homa_net(dev_net(skb->dev))->homa; + u64 saved_softirq_metric, softirq_cycles; + struct homa_offload_core *offload_core; struct sk_buff *result = NULL; - struct homa_core *core = homa_cores[raw_smp_processor_id()]; - __u32 hash; - __u64 saved_softirq_metric, softirq_cycles; - struct data_header *h_new = (struct data_header *) - skb_transport_header(skb); + struct homa_data_hdr *h_new; + u64 *softirq_cycles_metric; + struct sk_buff *held_skb; + u64 now = homa_clock(); int priority; - __u32 saddr; + u32 saddr; + u32 hash; + int busy; + + if (!homa_make_header_avl(skb)) + tt_record("homa_gro_receive couldn't pull enough data from packet"); + + // if (homa_drop_packet(homa)) { + // kfree_skb(skb); + // return ERR_PTR(-EINPROGRESS); + // } + + h_new = (struct homa_data_hdr *)skb_transport_header(skb); + offload_core = &per_cpu(homa_offload_core, smp_processor_id()); + busy = (now - offload_core->last_gro) < homa->gro_busy_cycles; + offload_core->last_active = now; if (skb_is_ipv6(skb)) { priority = ipv6_hdr(skb)->priority; saddr = ntohl(ipv6_hdr(skb)->saddr.in6_u.u6_addr32[3]); } else { - priority = ((struct iphdr *) skb_network_header(skb))->tos >> 5; + priority = ((struct iphdr *)skb_network_header(skb))->tos >> 5; saddr = ntohl(ip_hdr(skb)->saddr); } -// The test below is overly conservative except for data packets. -// if (!pskb_may_pull(skb, 64)) -// tt_record("homa_gro_receive can't pull enough data " -// "from packet for trace"); - if (h_new->common.type == DATA) - tt_record4("homa_gro_receive got packet from 0x%x " - "id %llu, offset %d, priority %d", - saddr, homa_local_id(h_new->common.sender_id), - ntohl(h_new->seg.offset), priority); - else if (h_new->common.type == GRANT) { - tt_record4("homa_gro_receive got grant from 0x%x " - "id %llu, offset %d, priority %d", - saddr, homa_local_id(h_new->common.sender_id), - ntohl(((struct grant_header *) h_new)->offset), - priority); + if (h_new->common.type == DATA) { + if (h_new->seg.offset == (__force __be32)-1) { + tt_record2("homa_gro_receive replaced offset %d with %d", + ntohl(h_new->seg.offset), + ntohl(h_new->common.sequence)); + h_new->seg.offset = h_new->common.sequence; + } + tt_record4("homa_gro_receive got packet from 0x%x id %llu, offset %d, priority %d", + saddr, homa_local_id(h_new->common.sender_id), + ntohl(h_new->seg.offset), priority); + if (homa_data_len(skb) == ntohl(h_new->message_length) && + (homa->gro_policy & HOMA_GRO_SHORT_BYPASS) && + !busy) { + INC_METRIC(gro_data_bypasses, 1); + goto bypass; + } + } else if (h_new->common.type == GRANT) { + tt_record4("homa_gro_receive got grant from 0x%x id %llu, offset %d, priority %d", + saddr, homa_local_id(h_new->common.sender_id), + ntohl(((struct homa_grant_hdr *)h_new)->offset), + priority); /* The following optimization handles grants here at NAPI * level, bypassing the SoftIRQ mechanism (and avoiding the * delay of handing off to a different core). This makes * a significant difference in throughput for large * messages, especially when the system is loaded. 
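+		 * The !busy test below is the guard for this bypass: if
+		 * this core has run GRO within the last gro_busy_cycles
+		 * (see @last_gro), the grant is handed to SoftIRQ as
+		 * usual rather than extending this core's NAPI occupancy.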
*/ - if (homa->gro_policy & HOMA_GRO_FAST_GRANTS) + if ((homa->gro_policy & HOMA_GRO_FAST_GRANTS) && !busy) { + INC_METRIC(gro_grant_bypasses, 1); goto bypass; - } else - tt_record4("homa_gro_receive got packet from 0x%x " - "id %llu, type 0x%x, priority %d", - saddr, homa_local_id(h_new->common.sender_id), - h_new->common.type, priority); - - core->last_active = get_cycles(); - - if ((homa->gro_policy & HOMA_GRO_BYPASS) - || ((homa->gro_policy & HOMA_GRO_SHORT_BYPASS) - && (skb->len < 1400))) - goto bypass; + } +#ifndef __STRIP__ /* See strip.py */ + } else { + tt_record4("homa_gro_receive got packet from 0x%x id %llu, type 0x%x, priority %d", + saddr, homa_local_id(h_new->common.sender_id), + h_new->common.type, priority); +#endif /* See strip.py */ + } /* The GRO mechanism tries to separate packets onto different * gro_lists by hash. This is bad for us, because we want to batch @@ -206,20 +367,48 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * core added a Homa packet (if there is such a list). */ hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - if (core->held_skb) { - /* Reverse-engineer the location of the napi_struct, so we + if (offload_core->held_skb) { + /* Reverse-engineer the location of the gro_node, so we * can verify that held_skb is still valid. */ struct gro_list *gro_list = container_of(held_list, struct gro_list, list); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) struct napi_struct *napi = container_of(gro_list, struct napi_struct, gro_hash[hash]); +#else + struct gro_node *gro_node = container_of(gro_list, + struct gro_node, hash[hash]); +#endif - /* Make sure that core->held_skb is on the list. */ + /* Must verify that offload_core->held_skb points to a packet on + * the list, and that the packet is a Homa packet. + * homa_gro_complete isn't always invoked before removing + * packets from the list, so offload_core->held_skb could be a + * dangling pointer (or the skb could have been reused for + * some other protocol). + */ list_for_each_entry(held_skb, - &napi->gro_hash[core->held_bucket].list, list) { - if (held_skb != core->held_skb) +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) + &napi->gro_hash[offload_core->held_bucket].list, +#else + &gro_node->hash[offload_core->held_bucket].list, +#endif + list) { + int protocol; + + if (held_skb != offload_core->held_skb) + continue; + if (skb_is_ipv6(held_skb)) + protocol = ipv6_hdr(held_skb)->nexthdr; + else + protocol = ip_hdr(held_skb)->protocol; + if (protocol != IPPROTO_HOMA) { + tt_record3("homa_gro_receive held_skb 0x%0x%0x isn't Homa: protocol %d", + tt_hi(held_skb), tt_lo(held_skb), + protocol); continue; + } /* Aggregate skb into held_skb. We don't update the * length of held_skb because we'll eventually split @@ -239,17 +428,25 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * returning skb as result is no longer * sufficient (as of 5.4.80) to push it up * the stack; the packet just gets queued on - * napi->rx_list. This code basically steals + * gro_node->rx_list. This code basically steals * the packet from dev_gro_receive and * pushes it upward. 
*/ skb_list_del_init(held_skb); homa_gro_complete(held_skb, 0); netif_receive_skb(held_skb); - napi->gro_hash[core->held_bucket].count--; - if (napi->gro_hash[core->held_bucket].count == 0) - __clear_bit(core->held_bucket, - &napi->gro_bitmask); + homa_send_ipis(); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) + napi->gro_hash[offload_core->held_bucket].count--; + if (napi->gro_hash[offload_core->held_bucket].count == 0) + __clear_bit(offload_core->held_bucket, + &napi->gro_bitmask); +#else + gro_node->hash[offload_core->held_bucket].count--; + if (gro_node->hash[offload_core->held_bucket].count == 0) + __clear_bit(offload_core->held_bucket, + &gro_node->bitmask); +#endif result = ERR_PTR(-EINPROGRESS); } goto done; @@ -264,33 +461,140 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * means we aren't heavily loaded; if batching does occur, * homa_gro_complete will pick a different core). */ - core->held_skb = skb; - core->held_bucket = hash; + offload_core->held_skb = skb; + offload_core->held_bucket = hash; if (likely(homa->gro_policy & HOMA_GRO_SAME_CORE)) - homa_set_softirq_cpu(skb, raw_smp_processor_id()); + homa_set_softirq_cpu(skb, smp_processor_id()); - done: - homa_check_pacer(homa, 1); +done: + homa_pacer_check(homa->pacer); + homa_qdisc_pacer_check(homa); + offload_core->last_gro = homa_clock(); return result; - bypass: - saved_softirq_metric = homa_cores[raw_smp_processor_id()] - ->metrics.softirq_cycles; +bypass: + /* Record SoftIRQ cycles in a different metric to reflect that + * they happened during bypass. + */ + softirq_cycles_metric = &homa_metrics_per_cpu()->softirq_cycles; + saved_softirq_metric = *softirq_cycles_metric; homa_softirq(skb); - softirq_cycles = homa_cores[raw_smp_processor_id()] - ->metrics.softirq_cycles - saved_softirq_metric; - homa_cores[raw_smp_processor_id()]->metrics.softirq_cycles - = saved_softirq_metric; + softirq_cycles = *softirq_cycles_metric - saved_softirq_metric; + *softirq_cycles_metric = saved_softirq_metric; INC_METRIC(bypass_softirq_cycles, softirq_cycles); + offload_core->last_gro = homa_clock(); - /* Record SoftIRQ cycles in a different metric to reflect that - * they happened during bypass. + /* This return value indicates that we have freed skb. */ + return ERR_PTR(-EINPROGRESS); +} + +/** + * homa_gro_gen2() - When the Gen2 load balancer is being used this function + * is invoked by homa_gro_complete to choose a core to handle SoftIRQ for a + * batch of packets + * @homa: Overall information about the Homa transport. + * @skb: First in a group of packets that are ready to be passed to SoftIRQ. + * Information will be updated in the packet so that Linux will + * direct it to the chosen core. + */ +void homa_gro_gen2(struct homa *homa, struct sk_buff *skb) +{ + /* Scan the next several cores in order after the current core, + * trying to find one that is not already busy with SoftIRQ processing, + * and that doesn't appear to be active with NAPI/GRO processing + * either. If there is no such core, just rotate among the next + * cores. See balance.txt for overall design information on load + * balancing. 
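+	 * For example, with CORES_TO_CHECK 4 on an 8-core machine, a
+	 * batch arriving on core 6 considers cores 7, 0, 1, and 2 in
+	 * order; the first with no SoftIRQ backlog and no recent GRO
+	 * activity wins, and if all four look busy the choice simply
+	 * rotates among them.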
*/ + struct homa_data_hdr *h = + (struct homa_data_hdr *)skb_transport_header(skb); + struct homa_offload_core *offload_core; + int this_core = smp_processor_id(); + int candidate = this_core; + u64 now = homa_clock(); + int i; + for (i = CORES_TO_CHECK; i > 0; i--) { + candidate++; + if (unlikely(candidate >= nr_cpu_ids)) + candidate = 0; + offload_core = &per_cpu(homa_offload_core, candidate); + if (atomic_read(&offload_core->softirq_backlog) > 0) + continue; + if ((offload_core->last_gro + homa->busy_cycles) > now) + continue; + tt_record3("homa_gro_gen2 chose core %d for id %d offset %d", + candidate, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); + break; + } + if (i <= 0) { + /* All of the candidates appear to be busy; just + * rotate among them. + */ + int offset = per_cpu(homa_offload_core, this_core).softirq_offset; - /* This return value indicates that we have freed skb. */ - return ERR_PTR(-EINPROGRESS); + offset += 1; + if (offset > CORES_TO_CHECK) + offset = 1; + per_cpu(homa_offload_core, this_core).softirq_offset = offset; + candidate = this_core + offset; + while (candidate >= nr_cpu_ids) + candidate -= nr_cpu_ids; + tt_record3("homa_gro_gen2 chose core %d for id %d offset %d (all cores busy)", + candidate, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); + } + atomic_inc(&per_cpu(homa_offload_core, candidate).softirq_backlog); + homa_set_softirq_cpu(skb, candidate); +} +/** + * homa_gro_gen3() - When the Gen3 load balancer is being used this function + * is invoked by homa_gro_complete to choose a core to handle SoftIRQ for a + * batch of packets + * @homa: Overall information about the Homa transport. + * @skb: First in a group of packets that are ready to be passed to SoftIRQ. + * Information will be updated in the packet so that Linux will + * direct it to the chosen core. + */ +void homa_gro_gen3(struct homa *homa, struct sk_buff *skb) +{ + /* See balance.txt for overall design information on the Gen3 + * load balancer. 
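+	 * In brief: each GRO core has up to NUM_GEN3_SOFTIRQ_CORES
+	 * candidate cores (configured via sysctl); the first candidate
+	 * whose last_app_active is older than homa->busy_cycles is
+	 * chosen, falling back to the first candidate when all of them
+	 * appear busy with application work.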
+ */ + struct homa_data_hdr *h = + (struct homa_data_hdr *)skb_transport_header(skb); + u64 now, busy_time; + int *candidates; + int i, core; + + candidates = per_cpu(homa_offload_core, + smp_processor_id()).gen3_softirq_cores; + now = homa_clock(); + busy_time = now - homa->busy_cycles; + + core = candidates[0]; + for (i = 0; i < NUM_GEN3_SOFTIRQ_CORES; i++) { + int candidate = candidates[i]; + + if (candidate < 0) + break; + if (per_cpu(homa_offload_core, candidate).last_app_active + < busy_time) { + core = candidate; + break; + } + } + homa_set_softirq_cpu(skb, core); + per_cpu(homa_offload_core, core).last_active = now; + tt_record4("homa_gro_gen3 chose core %d for id %d, offset %d, delta %d", + core, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset), + now - per_cpu(homa_offload_core, core).last_app_active); + INC_METRIC(gen3_handoffs, 1); + if (core != candidates[0]) + INC_METRIC(gen3_alt_handoffs, 1); } /** @@ -305,68 +609,24 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, */ int homa_gro_complete(struct sk_buff *skb, int hoffset) { - struct common_header *h = (struct common_header *) - skb_transport_header(skb); - struct data_header *d = (struct data_header *) h; -// tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", -// h->type, homa_local_id(h->sender_id), ntohl(d->seg.offset), -// NAPI_GRO_CB(skb)->count); + struct homa_data_hdr *h = + (struct homa_data_hdr *)skb_transport_header(skb); + struct homa *homa = homa_net(dev_net(skb->dev))->homa; -#define CORES_TO_CHECK 4 - if (homa->gro_policy & HOMA_GRO_IDLE_NEW) { - /* Pick a specific core to handle SoftIRQ processing for this - * group of packets. This policy scans the next several cores - * in order after this, trying to find one that is not - * already busy with SoftIRQ processing, and that doesn't appear - * to be active with NAPI/GRO processing either. If there - * is no such core, just rotate among the next cores. - */ - int i; - int this_core = raw_smp_processor_id(); - int candidate = this_core; - __u64 now = get_cycles(); - struct homa_core *core; - for (i = CORES_TO_CHECK; i > 0; i--) { - candidate++; - if (unlikely(candidate >= nr_cpu_ids)) - candidate = 0; - core = homa_cores[candidate]; - if (atomic_read(&core->softirq_backlog) > 0) - continue; - if ((core->last_gro + homa->gro_busy_cycles) > now) - continue; - tt_record3("homa_gro_complete chose core %d for id %d " - "offset %d with IDLE_NEW policy", - candidate, homa_local_id(h->sender_id), - ntohl(d->seg.offset)); - break; - } - if (i <= 0) { - /* All of the candidates appear to be busy; just - * rotate among them. 
- */ - int offset = homa_cores[this_core]->softirq_offset; - offset += 1; - if (offset > CORES_TO_CHECK) - offset = 1; - homa_cores[this_core]->softirq_offset = offset; - candidate = this_core + offset; - while (candidate >= nr_cpu_ids) { - candidate -= nr_cpu_ids; - } - tt_record3("homa_gro_complete chose core %d for id %d " - "offset %d with IDLE_NEW policy " - "(all cores busy)", - candidate, homa_local_id(h->sender_id), - ntohl(d->seg.offset)); - } - atomic_inc(&homa_cores[candidate]->softirq_backlog); - homa_cores[this_core]->last_gro = now; - homa_set_softirq_cpu(skb, candidate); + // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", + // h->common.type, homa_local_id(h->common.sender_id), + // ntohl(h->seg.offset), + // NAPI_GRO_CB(skb)->count); + + per_cpu(homa_offload_core, smp_processor_id()).held_skb = NULL; + if (homa->gro_policy & HOMA_GRO_GEN3) { + homa_gro_gen3(homa, skb); + } else if (homa->gro_policy & HOMA_GRO_GEN2) { + homa_gro_gen2(homa, skb); } else if (homa->gro_policy & HOMA_GRO_IDLE) { int i, core, best; - __u64 best_time = ~0; - __u64 last_active; + u64 best_time = ~0; + u64 last_active; /* Pick a specific core to handle SoftIRQ processing for this * group of packets. The goal here is to spread load so that no @@ -375,34 +635,34 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) * hasn't done NAPI or SoftIRQ processing for Homa in the * longest time. */ - core = best = raw_smp_processor_id(); + best = smp_processor_id(); + core = best; for (i = 0; i < CORES_TO_CHECK; i++) { core++; if (unlikely(core >= nr_cpu_ids)) core = 0; - last_active = homa_cores[core]->last_active; + last_active = per_cpu(homa_offload_core, core).last_active; if (last_active < best_time) { best_time = last_active; best = core; } } homa_set_softirq_cpu(skb, best); - tt_record3("homa_gro_complete chose core %d for id %d " - "offset %d with IDLE policy", - best, homa_local_id(h->sender_id), - ntohl(d->seg.offset)); + tt_record3("homa_gro_complete chose core %d for id %d offset %d with IDLE policy", + best, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); } else if (homa->gro_policy & HOMA_GRO_NEXT) { /* Use the next core (in circular order) to handle the * SoftIRQ processing. */ - int target = raw_smp_processor_id() + 1; + int target = smp_processor_id() + 1; + if (unlikely(target >= nr_cpu_ids)) target = 0; homa_set_softirq_cpu(skb, target); - tt_record3("homa_gro_complete chose core %d for id %d " - "offset %d with NEXT policy", - target, homa_local_id(h->sender_id), - ntohl(d->seg.offset)); + tt_record3("homa_gro_complete chose core %d for id %d offset %d with NEXT policy", + target, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); } return 0; diff --git a/homa_offload.h b/homa_offload.h new file mode 100644 index 00000000..936230e2 --- /dev/null +++ b/homa_offload.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ + +/* This file contains definitions related to homa_offload.c. */ + +#ifndef _HOMA_OFFLOAD_H +#define _HOMA_OFFLOAD_H + +#include <linux/types.h> + +/** + * struct homa_offload_core - Stores core-specific information used during + * GRO operations. + */ +struct homa_offload_core { + /** + * @last_active: homa_clock() time of the last known activity + * on this core, such as NAPI or SoftIRQ. Used for load balancing. + */ + u64 last_active; + + /** + * @last_gro: the most recent homa_clock() time when + * homa_gro_receive returned on this core. Used to determine + * whether GRO is keeping a core busy.
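+	 * A core counts as busy when homa_clock() - last_gro is less
+	 * than homa->gro_busy_cycles; homa_gro_receive uses this test
+	 * to decide whether the bypass optimizations are safe to apply.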
+ */ + u64 last_gro; + + /** + * @softirq_backlog: the number of batches of packets that have + * been queued for SoftIRQ processing on this core but haven't + * yet been processed. + */ + atomic_t softirq_backlog; + + /** + * @softirq_offset: used when rotating SoftIRQ assignment among + * the next cores; contains an offset to add to the current core + * to produce the core for SoftIRQ. + */ + int softirq_offset; + + /** + * @gen3_softirq_cores: when the Gen3 load balancer is in use, + * GRO will arrange for SoftIRQ processing to occur on one of + * these cores; -1 values are ignored (see balance.txt for more + * on load balancing). This information is filled in via sysctl. + */ +#define NUM_GEN3_SOFTIRQ_CORES 3 + int gen3_softirq_cores[NUM_GEN3_SOFTIRQ_CORES]; + + /** + * @last_app_active: the most recent homa_clock() time when an + * application was actively using Homa on this core (e.g., by + * sending or receiving messages). Used for load balancing + * (see balance.txt). + */ + u64 last_app_active; + + /** + * @held_skb: last packet buffer known to be available for + * merging other packets into on this core (note: may not still + * be available), or NULL if none. + */ + struct sk_buff *held_skb; + + /** + * @held_bucket: the index, within napi->gro_hash, of the list + * containing @held_skb; undefined if @held_skb is NULL. Used to + * verify that @held_skb is still available. + */ + int held_bucket; +}; +DECLARE_PER_CPU(struct homa_offload_core, homa_offload_core); + +int homa_gro_complete(struct sk_buff *skb, int thoff); +void homa_gro_gen2(struct homa *homa, struct sk_buff *skb); +void homa_gro_gen3(struct homa *homa, struct sk_buff *skb); +#ifndef __STRIP__ /* See strip.py */ +void homa_gro_hook_tcp(void); +void homa_gro_unhook_tcp(void); +#endif /* See strip.py */ +struct sk_buff *homa_gro_receive(struct list_head *gro_list, + struct sk_buff *skb); +struct sk_buff *homa_gso_segment(struct sk_buff *skb, + netdev_features_t features); +int homa_offload_end(void); +int homa_offload_init(void); +void homa_send_ipis(void); +#ifndef __STRIP__ /* See strip.py */ +struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, + struct sk_buff *skb); +#endif /* See strip.py */ + +#endif /* _HOMA_OFFLOAD_H */ diff --git a/homa_outgoing.c b/homa_outgoing.c index 4bd56608..12cce77a 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -1,46 +1,232 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains functions related to the sender side of message * transmission. It also contains utility functions for sending packets.
*/ #include "homa_impl.h" +#include "homa_peer.h" +#include "homa_rpc.h" +#include "homa_wire.h" + +#ifndef __STRIP__ /* See strip.py */ +#include "homa_pacer.h" +#include "homa_qdisc.h" +#include "homa_skb.h" +#else /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ + +/** + * homa_message_out_init() - Initialize rpc->msgout. + * @rpc: RPC whose output message should be initialized. Must be + * locked by caller. + * @length: Number of bytes that will eventually be in rpc->msgout. + */ +void homa_message_out_init(struct homa_rpc *rpc, int length) + __must_hold(rpc->bucket->lock) +{ + memset(&rpc->msgout, 0, sizeof(rpc->msgout)); + rpc->msgout.length = length; + rpc->msgout.next_xmit = &rpc->msgout.packets; +#ifndef __STRIP__ /* See strip.py */ + rpc->msgout.unscheduled = rpc->hsk->homa->unsched_bytes; + if (rpc->msgout.unscheduled > length) + rpc->msgout.unscheduled = length; +#endif /* See strip.py */ + rpc->msgout.init_time = homa_clock(); +} +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_fill_data_interleaved() - This function is invoked to fill in the + * part of a data packet after the initial header, when GSO is being used + * but TCP hijacking is not. As a result, homa_seg_hdrs must be interleaved + * with the data to provide the correct offset for each segment. + * @rpc: RPC whose output message is being created. Must be + * locked by caller. + * @skb: The packet being filled. The initial homa_data_hdr was + * created and initialized by the caller and the + * homa_skb_info has been filled in with the packet geometry. + * @iter: Describes location(s) of (remaining) message data in user + * space. + * Return: Either a negative errno or 0 (for success). + */ +#else /* See strip.py */ /** - * set_priority() - Arrange for an outgoing packet to have a particular - * priority level. - * @skb: The packet was priority should be set. - * @hsk: Socket on which the packet will be sent. - * @priority: Priority level for the packet; must be less than - * HOMA_MAX_PRIORITIES. + * homa_fill_data_interleaved() - This function is invoked to fill in the + * part of a data packet after the initial header, when GSO is being used. + * homa_seg_hdrs must be interleaved with the data to provide the correct + * offset for each segment. + * @rpc: RPC whose output message is being created. Must be + * locked by caller. + * @skb: The packet being filled. The initial homa_data_hdr was + * created and initialized by the caller and the + * homa_skb_info has been filled in with the packet geometry. + * @iter: Describes location(s) of (remaining) message data in user + * space. + * Return: Either a negative errno or 0 (for success). */ -inline static void set_priority(struct sk_buff *skb, struct homa_sock *hsk, - int priority) +#endif /* See strip.py */ +int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, + struct iov_iter *iter) + __must_hold(rpc->bucket->lock) { - /* Note: this code initially specified the priority in the VLAN - * header, but as of 3/2020, this performed badly on the CloudLab - * cluster being used for testing: 100 us of extra delay occurred - * whenever a packet's VLAN priority differed from the previous - * packet. So, now we use the DSCP field in the IP header instead.
+ struct homa_skb_info *homa_info = homa_get_skb_info(skb); + int seg_length = homa_info->seg_length; + int bytes_left = homa_info->data_bytes; + int offset = homa_info->offset; + int err; + + /* Each iteration of the following loop adds info for one packet, + * which includes a homa_seg_hdr followed by the data for that + * segment. The first homa_seg_hdr was already added by the caller. + */ + while (1) { + struct homa_seg_hdr seg; + + if (bytes_left < seg_length) + seg_length = bytes_left; + err = homa_skb_append_from_iter(rpc->hsk->homa, skb, iter, + seg_length); + if (err != 0) + return err; + bytes_left -= seg_length; + offset += seg_length; + + if (bytes_left == 0) + break; + + seg.offset = htonl(offset); + err = homa_skb_append_to_frag(rpc->hsk->homa, skb, &seg, + sizeof(seg)); + if (err != 0) + return err; + } + return 0; +} + +/** + * homa_tx_data_pkt_alloc() - Allocate a new sk_buff and fill it with an + * outgoing Homa data packet. The resulting packet will be a GSO packet + * that will eventually be segmented by the NIC. + * @rpc: RPC that packet will belong to (msgout must have been + * initialized). Must be locked by caller. + * @iter: Describes location(s) of (remaining) message data in user + * space. + * @offset: Offset in the message of the first byte of data in this + * packet. + * @length: How many bytes of data to include in the skb. Caller must + * ensure that this amount of data isn't too much for a + * well-formed GSO packet, and that iter has at least this + * much data. + * @max_seg_data: Maximum number of bytes of message data that can go in + * a single segment of the GSO packet. + * Return: A pointer to the new packet, or a negative errno. Sets + * rpc->hsk->error_msg on errors. + */ +struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, + struct iov_iter *iter, int offset, + int length, int max_seg_data) + __must_hold(rpc->bucket->lock) +{ + struct homa_sock *hsk = rpc->hsk; + struct homa_skb_info *homa_info; + struct homa_data_hdr *h; + struct sk_buff *skb; + int err, gso_size; + u64 segs; + + segs = length + max_seg_data - 1; + do_div(segs, max_seg_data); + + /* Initialize the overall skb. */ +#ifndef __STRIP__ /* See strip.py */ + skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr)); +#else /* See strip.py */ + skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) + length + + (segs - 1) * sizeof(struct homa_seg_hdr)); +#endif /* See strip.py */ + if (!skb) { + hsk->error_msg = "couldn't allocate sk_buff for outgoing message"; + return ERR_PTR(-ENOMEM); + } + + /* Fill in the Homa header (which will be replicated in every + * network packet by GSO). 
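+	 * For example (sizes hypothetical): length 4200 with
+	 * max_seg_data 1400 yields segs = 3, and the homa_data_hdr
+	 * assembled below appears verbatim in each of the three wire
+	 * packets once the NIC (or software GSO) segments this skb.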
*/ - hsk->inet.tos = hsk->homa->priority_map[priority]<<5; + h = (struct homa_data_hdr *)skb_put(skb, sizeof(struct homa_data_hdr)); + h->common.sport = htons(hsk->port); + h->common.dport = htons(rpc->dport); + h->common.sequence = htonl(offset); + h->common.type = DATA; + IF_NO_STRIP(homa_set_hijack(&h->common)); + homa_set_doff(h, sizeof(struct homa_data_hdr)); + h->common.checksum = 0; + h->common.sender_id = cpu_to_be64(rpc->id); + h->message_length = htonl(rpc->msgout.length); + IF_NO_STRIP(h->incoming = htonl(rpc->msgout.unscheduled)); + h->ack.client_id = 0; + homa_peer_get_acks(rpc->peer, 1, &h->ack); + IF_NO_STRIP(h->cutoff_version = rpc->peer->cutoff_version); + h->retransmit = 0; +#ifndef __STRIP__ /* See strip.py */ + h->seg.offset = htonl(-1); +#else /* See strip.py */ + h->seg.offset = htonl(offset); +#endif /* See strip.py */ + + homa_info = homa_get_skb_info(skb); + homa_info->next_skb = NULL; + homa_info->wire_bytes = length + segs * (sizeof(struct homa_data_hdr) + + hsk->ip_header_length + HOMA_ETH_OVERHEAD); + homa_info->data_bytes = length; + homa_info->seg_length = max_seg_data; + homa_info->offset = offset; + homa_info->rpc = rpc; + +#ifndef __STRIP__ /* See strip.py */ + if (segs > 1 && hsk->sock.sk_protocol != IPPROTO_TCP) { +#else /* See strip.py */ + if (segs > 1) { +#endif /* See strip.py */ + homa_set_doff(h, sizeof(struct homa_data_hdr) - + sizeof(struct homa_seg_hdr)); +#ifndef __STRIP__ /* See strip.py */ + h->seg.offset = htonl(offset); +#endif /* See strip.py */ + gso_size = max_seg_data + sizeof(struct homa_seg_hdr); + err = homa_fill_data_interleaved(rpc, skb, iter); + } else { + gso_size = max_seg_data; + err = homa_skb_append_from_iter(hsk->homa, skb, iter, length); + } + if (err) { + hsk->error_msg = "couldn't copy message body into packet buffers"; + goto error; + } + + if (segs > 1) { + skb_shinfo(skb)->gso_segs = segs; + skb_shinfo(skb)->gso_size = gso_size; + + /* It's unclear what gso_type should be used to force software + * GSO; the value below seems to work... + */ + skb_shinfo(skb)->gso_type = + hsk->homa->gso_force_software ? 0xd : + (hsk->inet.sk.sk_family == AF_INET6) ? SKB_GSO_TCPV6 : + SKB_GSO_TCPV4; + } + return skb; + +error: + homa_skb_free_tx(hsk->homa, skb); + return ERR_PTR(err); } /** - * homa_message_out_init() - Initializes information for sending a message + * homa_message_out_fill() - Initializes information for sending a message * for an RPC (either request or response); copies the message data from * user space and (possibly) begins transmitting the message. * @rpc: RPC for which to send message; this function must not @@ -49,197 +235,173 @@ inline static void set_priority(struct sk_buff *skb, struct homa_sock *hsk, * before returning. * @iter: Describes location(s) of message data in user space. * @xmit: Nonzero means this method should start transmitting packets; - * zero means the caller will initiate transmission. + * transmission will be overlapped with copying from user space. + * Zero means the caller will initiate transmission after this + * function returns. * - * Return: 0 for success, or a negative errno for failure. + * Return: 0 for success, or a negative errno for failure. It is possible + * for the RPC to be freed while this function is active. If that + * happens, copying will cease, -EINVAL will be returned, and + * rpc->state will be RPC_DEAD. Sets rpc->hsk->error_msg on errors. 
*/ -int homa_message_out_init(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) +int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) + __must_hold(rpc->bucket->lock) { /* Geometry information for packets: * mtu: largest size for an on-the-wire packet (including * all headers through IP header, but not Ethernet * header). - * max_pkt_data: largest amount of Homa message data that - * fits in an on-the-wire packet. - * gso_size: space required in each sk_buff (pre-GSO), starting - * with IP header. + * max_seg_data: largest amount of Homa message data that fits + * in an on-the-wire packet (after segmentation). + * max_gso_data: largest amount of Homa message data that fits + * in a GSO packet (before segmentation). */ - int mtu, max_pkt_data, gso_size; - int bytes_left; - int err; + int mtu, max_seg_data, max_gso_data; struct sk_buff **last_link; struct dst_entry *dst; - int overlap_xmit; - unsigned int gso_type; - - rpc->msgout.length = iter->count; - rpc->msgout.num_skbs = 0; - rpc->msgout.packets = NULL; - rpc->msgout.next_xmit = &rpc->msgout.packets; - rpc->msgout.next_xmit_offset = 0; - atomic_set(&rpc->msgout.active_xmits, 0); - rpc->msgout.sched_priority = 0; - rpc->msgout.init_cycles = get_cycles(); + u64 segs_per_gso; + /* Bytes of the message that haven't yet been copied into skbs. */ + int bytes_left; + int gso_size; + int err; - if (unlikely((rpc->msgout.length > HOMA_MAX_MESSAGE_LENGTH) - || (iter->count == 0))) { + if (unlikely(iter->count > HOMA_MAX_MESSAGE_LENGTH || + iter->count == 0)) { + rpc->hsk->error_msg = "message length exceeded HOMA_MAX_MESSAGE_LENGTH"; err = -EINVAL; goto error; } + homa_message_out_init(rpc, iter->count); - /* Compute the geometry of packets, both how they will end up on the - * wire and large they will be here (before GSO). - */ + /* Compute the geometry of packets. */ dst = homa_get_dst(rpc->peer, rpc->hsk); mtu = dst_mtu(dst); - max_pkt_data = mtu - rpc->hsk->ip_header_length - - sizeof(struct data_header); - - /* It's unclear what gso_type should be to force software GSO; the - * value below seems to work... + max_seg_data = mtu - rpc->hsk->ip_header_length + - sizeof(struct homa_data_hdr); + gso_size = dst->dev->gso_max_size; + if (gso_size > rpc->hsk->homa->max_gso_size) + gso_size = rpc->hsk->homa->max_gso_size; + dst_release(dst); + +#ifndef __STRIP__ /* See strip.py */ + /* Round gso_size down to an even # of mtus; calculation depends + * on whether we're doing TCP hijacking (need more space in TSO packet + * if no hijacking). */ - gso_type = (rpc->hsk->homa->gso_force_software) ? 0xd : SKB_GSO_TCPV6; - - if (rpc->msgout.length <= max_pkt_data) { - /* Message fits in a single packet: no need for GSO. */ - rpc->msgout.unscheduled = rpc->msgout.length; - rpc->msgout.gso_pkt_data = rpc->msgout.length; - gso_size = mtu; + if (rpc->hsk->sock.sk_protocol == IPPROTO_TCP) { + /* Hijacking */ + segs_per_gso = gso_size - rpc->hsk->ip_header_length + - sizeof(struct homa_data_hdr); + do_div(segs_per_gso, max_seg_data); } else { - /* Can use GSO to pass multiple network packets through the - * IP stack at once. - */ - int repl_length, pkts_per_gso; - - gso_size = rpc->peer->dst->dev->gso_max_size; - if (gso_size > rpc->hsk->homa->max_gso_size) - gso_size = rpc->hsk->homa->max_gso_size; - - /* Round gso_size down to an even # of mtus. 
*/ - repl_length = rpc->hsk->ip_header_length - + sizeof32(struct data_header) - - sizeof32(struct data_segment); - pkts_per_gso = (gso_size - repl_length)/(mtu - repl_length); - if (pkts_per_gso == 0) - pkts_per_gso = 1; - rpc->msgout.gso_pkt_data = pkts_per_gso * max_pkt_data; - gso_size = repl_length + (pkts_per_gso * (mtu - repl_length)); - - /* Round unscheduled bytes *up* to an even number of gsos. */ - rpc->msgout.unscheduled = rpc->hsk->homa->rtt_bytes - + rpc->msgout.gso_pkt_data - 1; - rpc->msgout.unscheduled -= rpc->msgout.unscheduled - % rpc->msgout.gso_pkt_data; - if (rpc->msgout.unscheduled > rpc->msgout.length) - rpc->msgout.unscheduled = rpc->msgout.length; + /* No hijacking */ + segs_per_gso = gso_size - rpc->hsk->ip_header_length - + sizeof(struct homa_data_hdr) + + sizeof(struct homa_seg_hdr); + do_div(segs_per_gso, max_seg_data + + sizeof(struct homa_seg_hdr)); } - UNIT_LOG("; ", "mtu %d, max_pkt_data %d, gso_size %d, gso_pkt_data %d", - mtu, max_pkt_data, gso_size, rpc->msgout.gso_pkt_data); - - overlap_xmit = rpc->msgout.length > 2*rpc->msgout.gso_pkt_data; +#else /* See strip.py */ + /* Round gso_size down to an even # of mtus. */ + segs_per_gso = gso_size - rpc->hsk->ip_header_length - + sizeof(struct homa_data_hdr) + + sizeof(struct homa_seg_hdr); + do_div(segs_per_gso, max_seg_data + + sizeof(struct homa_seg_hdr)); +#endif /* See strip.py */ + if (segs_per_gso == 0) + segs_per_gso = 1; + max_gso_data = segs_per_gso * max_seg_data; + UNIT_LOG("; ", "mtu %d, max_seg_data %d, max_gso_data %d", + mtu, max_seg_data, max_gso_data); + +#ifndef __STRIP__ /* See strip.py */ rpc->msgout.granted = rpc->msgout.unscheduled; - atomic_or(RPC_COPYING_FROM_USER, &rpc->flags); - - /* Copy message data from user space and form sk_buffs. Each - * iteration of the outer loop creates one sk_buff, which may - * contain info for multiple packets on the wire (via TSO or GSO). - */ - tt_record3("starting copy from user space for id %d, length %d, " - "unscheduled %d", - rpc->id, rpc->msgout.length, rpc->msgout.unscheduled); +#endif /* See strip.py */ + homa_skb_stash_pages(rpc->hsk->homa, rpc->msgout.length); + + /* Each iteration of the loop below creates one GSO packet. */ +#ifndef __STRIP__ /* See strip.py */ + tt_record3("starting copy from user space for id %d, length %d, unscheduled %d", + rpc->id, rpc->msgout.length, rpc->msgout.unscheduled); +#else /* See strip.py */ + tt_record2("starting copy from user space for id %d, length %d", + rpc->id, rpc->msgout.length); +#endif /* See strip.py */ last_link = &rpc->msgout.packets; for (bytes_left = rpc->msgout.length; bytes_left > 0; ) { - struct data_header *h; - struct data_segment *seg; - int available; + int skb_data_bytes, offset; struct sk_buff *skb; homa_rpc_unlock(rpc); - - skb = alloc_skb(HOMA_SKB_EXTRA + gso_size - + sizeof32(struct homa_skb_info), GFP_KERNEL); - if (unlikely(!skb)) { - err = -ENOMEM; + skb_data_bytes = max_gso_data; + offset = rpc->msgout.length - bytes_left; +#ifndef __STRIP__ /* See strip.py */ + if (offset < rpc->msgout.unscheduled && + (offset + skb_data_bytes) > rpc->msgout.unscheduled) { + /* Insert a packet boundary at the unscheduled limit, + * so we don't transmit extra data. 
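+			 * For example (illustrative numbers only): with
+			 * max_gso_data 8940 and unscheduled 10000, the
+			 * packet at offset 8940 is clipped to 1060 bytes
+			 * (offsets 8940-9999) rather than carrying a full
+			 * 8940 bytes beyond the unscheduled limit.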
+			 */
+			skb_data_bytes = rpc->msgout.unscheduled - offset;
+		}
+#endif /* See strip.py */
+		if (skb_data_bytes > bytes_left)
+			skb_data_bytes = bytes_left;
+		skb = homa_tx_data_pkt_alloc(rpc, iter, offset, skb_data_bytes,
+					     max_seg_data);
+		if (IS_ERR(skb)) {
+			err = PTR_ERR(skb);
 			homa_rpc_lock(rpc);
 			goto error;
 		}
-		if ((bytes_left > max_pkt_data)
-				&& (rpc->msgout.gso_pkt_data > max_pkt_data)) {
-			skb_shinfo(skb)->gso_size = sizeof(struct data_segment)
-					+ max_pkt_data;
-			skb_shinfo(skb)->gso_type = gso_type;
-		}
-		skb_shinfo(skb)->gso_segs = 0;
-
-		/* Fill in the initial portion (which will be replicated in
-		 * every network packet by GSO/TSO).
-		 */
-		skb_reserve(skb, rpc->hsk->ip_header_length + HOMA_SKB_EXTRA);
-		skb_reset_transport_header(skb);
-		h = (struct data_header *) skb_put(skb,
-				sizeof(*h) - sizeof(struct data_segment));
-		h->common.sport = htons(rpc->hsk->port);
-		h->common.dport = htons(rpc->dport);
-		homa_set_doff(h);
-		h->common.type = DATA;
-		h->common.sender_id = cpu_to_be64(rpc->id);
-		h->message_length = htonl(rpc->msgout.length);
-		h->incoming = htonl(rpc->msgout.unscheduled);
-		h->cutoff_version = rpc->peer->cutoff_version;
-		h->retransmit = 0;
-		homa_get_skb_info(skb)->wire_bytes = 0;
-
-		available = rpc->msgout.gso_pkt_data;
-
-		/* Each iteration of the following loop adds one segment
-		 * (which will become a separate packet after GSO) to the buffer.
-		 */
-		do {
-			int seg_size;
-			seg = (struct data_segment *) skb_put(skb, sizeof(*seg));
-			seg->offset = htonl(rpc->msgout.length - bytes_left);
-			if (bytes_left <= max_pkt_data)
-				seg_size = bytes_left;
-			else
-				seg_size = max_pkt_data;
-			seg->segment_length = htonl(seg_size);
-			seg->ack.client_id = 0;
-			homa_peer_get_acks(rpc->peer, 1, &seg->ack);
-			if (copy_from_iter(skb_put(skb, seg_size), seg_size,
-					iter) != seg_size) {
-				err = -EFAULT;
-				kfree_skb(skb);
-				homa_rpc_lock(rpc);
-				goto error;
-			}
-			bytes_left -= seg_size;
-			(skb_shinfo(skb)->gso_segs)++;
-			available -= seg_size;
-			homa_get_skb_info(skb)->wire_bytes += mtu
-					- (max_pkt_data - seg_size)
-					+ HOMA_ETH_OVERHEAD;
-		} while ((available > 0) && (bytes_left > 0));
+		bytes_left -= skb_data_bytes;
 		homa_rpc_lock(rpc);
+		if (rpc->state == RPC_DEAD) {
+			/* RPC was freed while we were copying. */
+			rpc->hsk->error_msg = "rpc deleted while creating outgoing message";
+			err = -EINVAL;
+			homa_skb_free_tx(rpc->hsk->homa, skb);
+			goto error;
+		}
 		*last_link = skb;
 		last_link = &(homa_get_skb_info(skb)->next_skb);
 		*last_link = NULL;
 		rpc->msgout.num_skbs++;
-		if (overlap_xmit && list_empty(&rpc->throttled_links) && xmit) {
-			tt_record1("waking up pacer for id %d", rpc->id);
-			homa_add_to_throttled(rpc);
-		}
+		rpc->msgout.skb_memory += skb->truesize;
+		rpc->msgout.copied_from_user = rpc->msgout.length - bytes_left;
+		rpc->msgout.first_not_tx = rpc->msgout.packets;
+#ifndef __STRIP__ /* See strip.py */
+		/* The code below improves pipelining for long messages
+		 * by overlapping transmission with copying from user space.
+		 * This is a bit tricky because sending the packets takes
+		 * a significant amount of time. On high-speed networks (e.g.
+		 * 100 Gbps and above), copying from user space is the
+		 * bottleneck, so transmitting the packets here will slow
+		 * that down. Thus, we only transmit the unscheduled packets
+		 * here, to fill the pipe. Packets after that can be
+		 * transmitted by SoftIRQ in response to incoming grants;
+		 * this allows us to use two cores: this core copying data
+		 * and the SoftIRQ core sending packets.
+ */ + if (offset < rpc->msgout.unscheduled && xmit) + homa_xmit_data(rpc, false); +#endif /* See strip.py */ } tt_record2("finished copy from user space for id %d, length %d", - rpc->id, rpc->msgout.length); - atomic_andnot(RPC_COPYING_FROM_USER, &rpc->flags); + rpc->id, rpc->msgout.length); INC_METRIC(sent_msg_bytes, rpc->msgout.length); - if (!overlap_xmit && xmit) + refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); + if (xmit) +#ifndef __STRIP__ /* See strip.py */ homa_xmit_data(rpc, false); +#else /* See strip.py */ + homa_xmit_data(rpc); +#endif /* See strip.py */ return 0; - error: - atomic_andnot(RPC_COPYING_FROM_USER, &rpc->flags); +error: + refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); return err; } @@ -252,18 +414,21 @@ int homa_message_out_init(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) * @length: Length of @contents (including the common header). * @rpc: The packet will go to the socket that handles the other end * of this RPC. Addressing info for the packet, including all of - * the fields of common_header except type, will be set from this. + * the fields of homa_common_hdr except type, will be set from this. + * Caller must hold either the lock or a reference. * * Return: Either zero (for success), or a negative errno value if there * was a problem. */ int homa_xmit_control(enum homa_packet_type type, void *contents, - size_t length, struct homa_rpc *rpc) + size_t length, struct homa_rpc *rpc) { - struct common_header *h = (struct common_header *) contents; + struct homa_common_hdr *h = contents; + h->type = type; h->sport = htons(rpc->hsk->port); h->dport = htons(rpc->dport); + IF_NO_STRIP(homa_set_hijack(h)); h->sender_id = cpu_to_be64(rpc->id); return __homa_xmit_control(contents, length, rpc->peer, rpc->hsk); } @@ -282,70 +447,61 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, * was a problem. */ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, - struct homa_sock *hsk) + struct homa_sock *hsk) { - struct common_header *h; - int extra_bytes; - int result, priority; - struct dst_entry *dst; + struct homa_common_hdr *h; struct sk_buff *skb; + int extra_bytes; + int result; - /* Allocate the same size sk_buffs as for the smallest data - * packets (better reuse of sk_buffs?). 
- */ - dst = homa_get_dst(peer, hsk); - skb = alloc_skb(dst_mtu(dst) + HOMA_SKB_EXTRA + sizeof32(void*), - GFP_KERNEL); + IF_NO_STRIP(int priority); + + skb = homa_skb_alloc_tx(HOMA_MAX_HEADER); if (unlikely(!skb)) return -ENOBUFS; - dst_hold(dst); - skb_dst_set(skb, dst); + skb_dst_set(skb, homa_get_dst(peer, hsk)); - skb_reserve(skb, hsk->ip_header_length + HOMA_SKB_EXTRA); - skb_reset_transport_header(skb); - h = (struct common_header *) skb_put(skb, length); + h = skb_put(skb, length); memcpy(h, contents, length); extra_bytes = HOMA_MIN_PKT_LENGTH - length; if (extra_bytes > 0) { memset(skb_put(skb, extra_bytes), 0, extra_bytes); UNIT_LOG(",", "padded control packet with %d bytes", - extra_bytes); + extra_bytes); } - priority = hsk->homa->num_priorities-1; +#ifndef __STRIP__ /* See strip.py */ + priority = hsk->homa->num_priorities - 1; +#endif /* See strip.py */ skb->ooo_okay = 1; - skb_get(skb); +#ifndef __STRIP__ /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) { result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, - NULL, hsk->homa->priority_map[priority] << 4, 0); + NULL, hsk->homa->priority_map[priority] << 5, + 0); } else { /* This will find its way to the DSCP field in the IPv4 hdr. */ - hsk->inet.tos = hsk->homa->priority_map[priority]<<5; + hsk->inet.tos = hsk->homa->priority_map[priority] << 5; result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); } - if (unlikely(result != 0)) { + if (unlikely(result != 0)) INC_METRIC(control_xmit_errors, 1); - - /* It appears that ip*_xmit frees skbuffs after - * errors; the following code is to raise an alert if - * this isn't actually the case. The extra skb_get above - * and kfree_skb below are needed to do the check - * accurately (otherwise the buffer could be freed and - * its memory used for some other purpose, resulting in - * a bogus "reference count"). - */ - if (refcount_read(&skb->users) > 1) { - if (hsk->inet.sk.sk_family == AF_INET6) { - printk(KERN_NOTICE "ip6_xmit didn't free " - "Homa control packet after " - "error\n"); - } else { - printk(KERN_NOTICE "ip_queue_xmit didn't free " - "Homa control packet after " - "error\n"); - } - } + if (skb->dev) { + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); + if (netif_tx_queue_stopped(txq)) + tt_record4("__homa_xmit_control found stopped txq for id %d, qid %u, num_queued %u, limit %d", + be64_to_cpu(h->sender_id), + skb->queue_mapping, + txq->dql.num_queued, txq->dql.adj_limit); } - kfree_skb(skb); +#else /* See strip.py */ + if (hsk->inet.sk.sk_family == AF_INET6) + result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, + NULL, 0, 0); + else + result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); +#endif /* See strip.py */ INC_METRIC(packets_sent[h->type - DATA], 1); INC_METRIC(priority_bytes[priority], skb->len); INC_METRIC(priority_packets[priority], 1); @@ -353,35 +509,39 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, } /** - * homa_xmit_unknown() - Send an UNKNOWN packet to a peer. + * homa_xmit_unknown() - Send an RPC_UNKNOWN packet to a peer. * @skb: Buffer containing an incoming packet; identifies the peer to - * which the UNKNOWN packet should be sent. - * @hsk: Socket that should be used to send the UNKNOWN packet. + * which the RPC_UNKNOWN packet should be sent. + * @hsk: Socket that should be used to send the RPC_UNKNOWN packet. 
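+ *
+ * An RPC_UNKNOWN packet indicates that the sender has no record of the
+ * RPC; the peer will typically respond by restarting or aborting the RPC.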
 */
void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk)
{
-	struct common_header *h = (struct common_header *) skb->data;
-	struct unknown_header unknown;
-	struct homa_peer *peer;
+	struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data;
 	struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
+	struct homa_rpc_unknown_hdr unknown;
+	struct homa_peer *peer;

+#ifndef __STRIP__ /* See strip.py */
 	if (hsk->homa->verbose)
-		printk(KERN_NOTICE "sending UNKNOWN to peer "
-				"%s:%d for id %llu",
-				homa_print_ipv6_addr(&saddr),
-				ntohs(h->sport), homa_local_id(h->sender_id));
+		pr_notice("sending RPC_UNKNOWN to peer %s:%d for id %llu",
+			  homa_print_ipv6_addr(&saddr),
+			  ntohs(h->sport), homa_local_id(h->sender_id));
+#endif /* See strip.py */
 	tt_record3("sending unknown to 0x%x:%d for id %llu",
-			tt_addr(saddr), ntohs(h->sport),
-			homa_local_id(h->sender_id));
+		   tt_addr(saddr), ntohs(h->sport),
+		   homa_local_id(h->sender_id));
 	unknown.common.sport = h->dport;
 	unknown.common.dport = h->sport;
+	unknown.common.type = RPC_UNKNOWN;
+	IF_NO_STRIP(homa_set_hijack(&unknown.common));
 	unknown.common.sender_id = cpu_to_be64(homa_local_id(h->sender_id));
-	unknown.common.type = UNKNOWN;
-	peer = homa_peer_find(&hsk->homa->peers, &saddr, &hsk->inet);
-	if (!IS_ERR(peer))
-		__homa_xmit_control(&unknown, sizeof(unknown), peer, hsk);
+	peer = homa_peer_get(hsk, &saddr);
+	if (!IS_ERR(peer)) {
+		__homa_xmit_control(&unknown, sizeof(unknown), peer, hsk);
+		homa_peer_release(peer);
+	}
 }

+#ifndef __STRIP__ /* See strip.py */
 /**
  * homa_xmit_data() - If an RPC has outbound data packets that are permitted
  * to be transmitted according to the scheduling mechanism, arrange for
  * them to be sent.
@@ -390,58 +550,100 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk)
  * @rpc:       RPC to check for transmittable packets. Must be locked by
  *             caller. Note: this function will release the RPC lock while
  *             passing packets through the RPC stack, then reacquire it
- *             before returning.
+ *             before returning. It is possible that the RPC gets terminated
+ *             when the lock isn't held, in which case the state will
+ *             be RPC_DEAD on return.
  * @force:     True means send at least one packet, even if the NIC queue
  *             is too long. False means that zero packets may be sent, if
  *             the NIC queue is sufficiently long.
  */
 void homa_xmit_data(struct homa_rpc *rpc, bool force)
+#else /* See strip.py */
+/**
+ * homa_xmit_data() - If an RPC has outbound data packets that are permitted
+ * to be transmitted according to the scheduling mechanism, arrange for
+ * them to be sent.
+ * @rpc:       RPC to check for transmittable packets. Must be locked by
+ *             caller. Note: this function will release the RPC lock while
+ *             passing packets through the RPC stack, then reacquire it
+ *             before returning. It is possible that the RPC gets terminated
+ *             when the lock isn't held, in which case the state will
+ *             be RPC_DEAD on return.
+ */ +void homa_xmit_data(struct homa_rpc *rpc) +#endif /* See strip.py */ + __must_hold(rpc->bucket->lock) { - struct homa *homa = rpc->hsk->homa; + int length; - atomic_inc(&rpc->msgout.active_xmits); - while (*rpc->msgout.next_xmit) { - int priority; + IF_NO_STRIP(struct homa *homa = rpc->hsk->homa); + IF_NO_STRIP(struct netdev_queue *txq); + + while (*rpc->msgout.next_xmit && rpc->state != RPC_DEAD) { struct sk_buff *skb = *rpc->msgout.next_xmit; + IF_NO_STRIP(int priority); + +#ifndef __STRIP__ /* See strip.py */ if (rpc->msgout.next_xmit_offset >= rpc->msgout.granted) { - tt_record3("homa_xmit_data stopping at offset %d " - "for id %u: granted is %d", - rpc->msgout.next_xmit_offset, rpc->id, - rpc->msgout.granted); + tt_record3("homa_xmit_data stopping at offset %d for id %u: granted is %d", + rpc->msgout.next_xmit_offset, rpc->id, + rpc->msgout.granted); break; } - if ((rpc->msgout.length - rpc->msgout.next_xmit_offset) - >= homa->throttle_min_bytes) { - if (!homa_check_nic_queue(homa, skb, force)) { - tt_record1("homa_xmit_data adding id %u to " - "throttle queue", rpc->id); - homa_add_to_throttled(rpc); + if (rpc->msgout.length - rpc->msgout.next_xmit_offset > + homa->qshared->defer_min_bytes && + !homa_qdisc_active(rpc->hsk->homa)) { + if (!homa_pacer_check_nic_q(homa->pacer, skb, force)) { + tt_record1("homa_xmit_data adding id %u to throttle queue", + rpc->id); + homa_pacer_manage_rpc(rpc); break; } } - if (rpc->msgout.next_xmit_offset < rpc->msgout.unscheduled) { + if (rpc->msgout.next_xmit_offset < rpc->msgout.unscheduled) priority = homa_unsched_priority(homa, rpc->peer, - rpc->msgout.length); - } else { + rpc->msgout.length); + else priority = rpc->msgout.sched_priority; - } +#endif /* See strip.py */ rpc->msgout.next_xmit = &(homa_get_skb_info(skb)->next_skb); - rpc->msgout.next_xmit_offset += rpc->msgout.gso_pkt_data; - if (rpc->msgout.next_xmit_offset > rpc->msgout.length) - rpc->msgout.next_xmit_offset = rpc->msgout.length; + length = homa_get_skb_info(skb)->data_bytes; + rpc->msgout.next_xmit_offset += length; +#ifndef __STRIP__ /* See strip.py */ + if (homa_is_client(rpc->id)) { + INC_METRIC(client_request_bytes_done, length); + INC_METRIC(client_requests_done, + rpc->msgout.next_xmit_offset == + rpc->msgout.length); + } else { + INC_METRIC(server_response_bytes_done, length); + INC_METRIC(server_responses_done, + rpc->msgout.next_xmit_offset == + rpc->msgout.length); + } +#endif /* See strip.py */ homa_rpc_unlock(rpc); skb_get(skb); +#ifndef __STRIP__ /* See strip.py */ __homa_xmit_data(skb, rpc, priority); + txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); + if (netif_tx_queue_stopped(txq)) + tt_record4("homa_xmit_data found stopped txq for id %d, qid %d, num_queued %d, limit %d", + rpc->id, skb->queue_mapping, + txq->dql.num_queued, txq->dql.adj_limit); force = false; +#else /* See strip.py */ + __homa_xmit_data(skb, rpc); +#endif /* See strip.py */ homa_rpc_lock(rpc); } - atomic_dec(&rpc->msgout.active_xmits); } +#ifndef __STRIP__ /* See strip.py */ /** * __homa_xmit_data() - Handles packet transmission stuff that is common * to homa_xmit_data and homa_resend_data. @@ -451,507 +653,231 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) * @priority: Priority level at which to transmit the packet. */ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) +#else /* See strip.py */ +/** + * __homa_xmit_data() - Handles packet transmission stuff that is common + * to homa_xmit_data and homa_resend_data. + * @skb: Packet to be sent. 
The packet will be freed after transmission + * (and also if errors prevented transmission). + * @rpc: Information about the RPC that the packet belongs to. + */ +void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) +#endif /* See strip.py */ { +#ifndef __STRIP__ /* See strip.py */ int err; - struct data_header *h = (struct data_header *) - skb_transport_header(skb); - struct dst_entry *dst; /* Update info that may have changed since the message was initially * created. */ - h->cutoff_version = rpc->peer->cutoff_version; + ((struct homa_data_hdr *)skb_transport_header(skb))->cutoff_version = + rpc->peer->cutoff_version; +#endif /* See strip.py */ - dst = homa_get_dst(rpc->peer, rpc->hsk); - dst_hold(dst); - skb_dst_set(skb, dst); + skb_dst_set(skb, homa_get_dst(rpc->peer, rpc->hsk)); skb->ooo_okay = 1; skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct common_header, checksum); + skb->csum_offset = offsetof(struct homa_common_hdr, checksum); if (rpc->hsk->inet.sk.sk_family == AF_INET6) { - tt_record4("calling ip6_xmit: wire_bytes %d, peer 0x%x, id %d, " - "offset %d", - homa_get_skb_info(skb)->wire_bytes, - tt_addr(rpc->peer->addr), rpc->id, - ntohl(h->seg.offset)); + tt_record4("calling ip6_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", + homa_get_skb_info(skb)->wire_bytes, + tt_addr(rpc->peer->addr), rpc->id, + homa_get_skb_info(skb)->offset); +#ifndef __STRIP__ /* See strip.py */ err = ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, - 0, NULL, - rpc->hsk->homa->priority_map[priority] << 4, 0); + 0, NULL, + rpc->hsk->homa->priority_map[priority] << 5, 0); +#else /* See strip.py */ + ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, + 0, NULL, 0, 0); +#endif /* See strip.py */ } else { - tt_record4("calling ip_queue_xmit: wire_bytes %d, peer 0x%x, " - "id %d, offset %d", - homa_get_skb_info(skb)->wire_bytes, - tt_addr(rpc->peer->addr), rpc->id, - htonl(h->seg.offset)); - - rpc->hsk->inet.tos = rpc->hsk->homa->priority_map[priority]<<5; + tt_record4("calling ip_queue_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", + homa_get_skb_info(skb)->wire_bytes, + tt_addr(rpc->peer->addr), rpc->id, + homa_get_skb_info(skb)->offset); + +#ifndef __STRIP__ /* See strip.py */ + rpc->hsk->inet.tos = + rpc->hsk->homa->priority_map[priority] << 5; err = ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); +#else /* See strip.py */ + ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); +#endif /* See strip.py */ } - tt_record4("Finished queueing packet: rpc id %llu, offset %d, len %d, " - "granted %d", - rpc->id, ntohl(h->seg.offset), skb->len, - rpc->msgout.granted); - if (err) { + tt_record4("Finished queueing packet: rpc id %llu, offset %d, len %d, qid %d", + rpc->id, homa_get_skb_info(skb)->offset, + homa_get_skb_info(skb)->data_bytes, skb->queue_mapping); +#ifndef __STRIP__ /* See strip.py */ + if (err) INC_METRIC(data_xmit_errors, 1); - } +#endif /* See strip.py */ INC_METRIC(packets_sent[0], 1); INC_METRIC(priority_bytes[priority], skb->len); INC_METRIC(priority_packets[priority], 1); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_resend_data() - This function is invoked as part of handling RESEND - * requests. It retransmits the packets containing a given range of bytes + * requests. It retransmits the packet(s) containing a given range of bytes * from a message. - * @rpc: RPC for which data should be resent. + * @rpc: RPC for which data should be resent. 
Must be locked by caller. * @start: Offset within @rpc->msgout of the first byte to retransmit. * @end: Offset within @rpc->msgout of the byte just after the last one * to retransmit. * @priority: Priority level to use for the retransmitted data packets. */ void homa_resend_data(struct homa_rpc *rpc, int start, int end, - int priority) + int priority) +#else /* See strip.py */ +/** + * homa_resend_data() - This function is invoked as part of handling RESEND + * requests. It retransmits the packet(s) containing a given range of bytes + * from a message. + * @rpc: RPC for which data should be resent. + * @start: Offset within @rpc->msgout of the first byte to retransmit. + * @end: Offset within @rpc->msgout of the byte just after the last one + * to retransmit. + */ +void homa_resend_data(struct homa_rpc *rpc, int start, int end) +#endif /* See strip.py */ + __must_hold(rpc->bucket->lock) { + struct homa_skb_info *homa_info; struct sk_buff *skb; if (end <= start) return; - /* The nested loop below scans each data_segment in each - * packet, looking for those that overlap the range of - * interest. + /* Each iteration of this loop checks one packet in the message + * to see if it contains segments that need to be retransmitted. */ - for (skb = rpc->msgout.packets; skb != NULL; - skb = homa_get_skb_info(skb)->next_skb) { - int seg_offset = (skb_transport_header(skb) - skb->head) - + sizeof32(struct data_header) - - sizeof32(struct data_segment); - int offset, length, count; - struct data_segment *seg; - struct data_header *h; - - count = skb_shinfo(skb)->gso_segs; - if (count < 1) - count = 1; - for ( ; count > 0; count--, - seg_offset += sizeof32(*seg) + length) { + for (skb = rpc->msgout.packets; skb; skb = homa_info->next_skb) { + int seg_offset, offset, seg_length, data_left; + struct homa_data_hdr *h; + + homa_info = homa_get_skb_info(skb); + offset = homa_info->offset; + if (offset >= end) + break; + if (start >= (offset + homa_info->data_bytes)) + continue; + + offset = homa_info->offset; + seg_offset = sizeof(struct homa_data_hdr); + data_left = homa_info->data_bytes; + if (skb_shinfo(skb)->gso_segs <= 1) { + seg_length = data_left; + } else { + seg_length = homa_info->seg_length; + h = (struct homa_data_hdr *)skb_transport_header(skb); + } + for ( ; data_left > 0; data_left -= seg_length, + offset += seg_length, + seg_offset += skb_shinfo(skb)->gso_size) { + struct homa_skb_info *new_homa_info; struct sk_buff *new_skb; - seg = (struct data_segment *) (skb->head + seg_offset); - offset = ntohl(seg->offset); - length = ntohl(seg->segment_length); + int err; + + if (seg_length > data_left) + seg_length = data_left; if (end <= offset) goto resend_done; - if ((offset + length) <= start) + if ((offset + seg_length) <= start) continue; - /* This segment must be retransmitted. Copy it into - * a clean sk_buff. - */ - new_skb = alloc_skb(length + sizeof(struct data_header) - + rpc->hsk->ip_header_length - + HOMA_SKB_EXTRA, GFP_KERNEL); + /* This segment must be retransmitted. 
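+			 * The bytes are copied into a fresh skb because
+			 * the original skb may still be queued in the
+			 * stack or NIC, so it cannot safely be reused.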
*/ +#ifndef __STRIP__ /* See strip.py */ + new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr)); +#else /* See strip.py */ + new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) + + seg_length); +#endif /* See strip.py */ if (unlikely(!new_skb)) { - if (rpc->hsk->homa->verbose) - printk(KERN_NOTICE "homa_resend_data " - "couldn't allocate skb\n"); - continue; + UNIT_LOG("; ", "skb allocation error"); + goto resend_done; } - skb_reserve(new_skb, rpc->hsk->ip_header_length - + HOMA_SKB_EXTRA); - skb_reset_transport_header(new_skb); - __skb_put_data(new_skb, skb_transport_header(skb), - sizeof32(struct data_header) - - sizeof32(struct data_segment)); - __skb_put_data(new_skb, seg, sizeof32(*seg) + length); - h = ((struct data_header *) skb_transport_header(new_skb)); + h = __skb_put_data(new_skb, skb_transport_header(skb), + sizeof(struct homa_data_hdr)); + h->common.sequence = htonl(offset); + h->seg.offset = htonl(offset); h->retransmit = 1; - if ((offset + length) <= rpc->msgout.granted) - h->incoming = htonl(rpc->msgout.granted); - else if ((offset + length) > rpc->msgout.length) - h->incoming = htonl(rpc->msgout.length); - else - h->incoming = htonl(offset + length); + IF_NO_STRIP(h->incoming = htonl(end)); + err = homa_skb_append_from_skb(rpc->hsk->homa, new_skb, + skb, seg_offset, + seg_length); + if (err != 0) { + pr_err("%s got error %d from homa_skb_append_from_skb\n", + __func__, err); + UNIT_LOG("; ", "%s got error %d while copying data", + __func__, -err); + kfree_skb(new_skb); + goto resend_done; + } + + new_homa_info = homa_get_skb_info(new_skb); + new_homa_info->next_skb = rpc->msgout.to_free; + new_homa_info->wire_bytes = rpc->hsk->ip_header_length + + sizeof(struct homa_data_hdr) + + seg_length + HOMA_ETH_OVERHEAD; + new_homa_info->data_bytes = seg_length; + new_homa_info->seg_length = seg_length; + new_homa_info->offset = offset; + new_homa_info->rpc = rpc; + + rpc->msgout.to_free = new_skb; + rpc->msgout.num_skbs++; + skb_get(new_skb); tt_record3("retransmitting offset %d, length %d, id %d", - offset, length, rpc->id); - homa_check_nic_queue(rpc->hsk->homa, new_skb, true); + offset, seg_length, rpc->id); +#ifndef __STRIP__ /* See strip.py */ + homa_pacer_check_nic_q(rpc->hsk->homa->pacer, new_skb, + true); __homa_xmit_data(new_skb, rpc, priority); +#else /* See strip.py */ + __homa_xmit_data(new_skb, rpc); +#endif /* See strip.py */ INC_METRIC(resent_packets, 1); } } resend_done: - /* Advance next_xmit past any packets that have now been completely - * transmitted (otherwise they'll get sent again, unnecessarily). - */ - while (true) { - struct sk_buff *skb_next; - int pkt_end; - - skb = *rpc->msgout.next_xmit; - if (skb == NULL) - break; - skb_next = homa_get_skb_info(skb)->next_skb; - if (skb_next != NULL) { - struct data_header *h = ((struct data_header *) - skb_transport_header(skb_next)); - pkt_end = ntohl(h->seg.offset); - } else { - pkt_end = rpc->msgout.length; - } - if (pkt_end > end) - break; - rpc->msgout.next_xmit = &(homa_get_skb_info(skb)->next_skb); - rpc->msgout.next_xmit_offset = pkt_end; - } -} - -/** - * homa_outgoing_sysctl_changed() - Invoked whenever a sysctl value is changed; - * any output-related parameters that depend on sysctl-settable values. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_outgoing_sysctl_changed(struct homa *homa) -{ - __u64 tmp; - - /* Code below is written carefully to avoid integer underflow or - * overflow under expected usage patterns. Be careful when changing! 
- */ - homa->cycles_per_kbyte = (8*(__u64) cpu_khz)/homa->link_mbps; - homa->cycles_per_kbyte = (101*homa->cycles_per_kbyte)/100; - tmp = homa->max_nic_queue_ns; - tmp = (tmp*cpu_khz)/1000000; - homa->max_nic_queue_cycles = tmp; -} - -/** - * homa_check_nic_queue() - This function is invoked before passing a packet - * to the NIC for transmission. It serves two purposes. First, it maintains - * an estimate of the NIC queue length. Second, it indicates to the caller - * whether the NIC queue is so full that no new packets should be queued - * (Homa's SRPT depends on keeping the NIC queue short). - * @homa: Overall data about the Homa protocol implementation. - * @skb: Packet that is about to be transmitted. - * @force: True means this packet is going to be transmitted - * regardless of the queue length. - * Return: Nonzero is returned if either the NIC queue length is - * acceptably short or @force was specified. 0 means that the - * NIC queue is at capacity or beyond, so the caller should delay - * the transmission of @skb. If nonzero is returned, then the - * queue estimate is updated to reflect the transmission of @skb. - */ -int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) -{ - __u64 idle, new_idle, clock; - int cycles_for_packet, bytes; - - bytes = homa_get_skb_info(skb)->wire_bytes; - cycles_for_packet = (bytes * homa->cycles_per_kbyte)/1000; - while (1) { - clock = get_cycles(); - idle = atomic64_read(&homa->link_idle_time); - if (((clock + homa->max_nic_queue_cycles) < idle) && !force - && !(homa->flags & HOMA_FLAG_DONT_THROTTLE)) - return 0; - if (!list_empty(&homa->throttled_rpcs)) - INC_METRIC(pacer_bytes, bytes); - if (idle < clock) { - if (homa->pacer_wake_time) { - __u64 lost = (homa->pacer_wake_time > idle) - ? clock - homa->pacer_wake_time - : clock - idle; - INC_METRIC(pacer_lost_cycles, lost); - tt_record1("pacer lost %d cycles", lost); - } - new_idle = clock + cycles_for_packet; - } else - new_idle = idle + cycles_for_packet; - - /* This method must be thread-safe. */ - if (atomic64_cmpxchg_relaxed(&homa->link_idle_time, idle, - new_idle) == idle) - break; - } - return 1; -} - -/** - * homa_pacer_main() - Top-level function for the pacer thread. - * @transportInfo: Pointer to struct homa. - * - * Return: Always 0. - */ -int homa_pacer_main(void *transportInfo) -{ - struct homa *homa = (struct homa *) transportInfo; - - homa->pacer_wake_time = get_cycles(); - while (1) { - if (homa->pacer_exit) { - homa->pacer_wake_time = 0; - break; - } - homa_pacer_xmit(homa); - - /* Sleep this thread if the throttled list is empty. Even - * if the throttled list isn't empty, call the scheduler - * to give other processes a chance to run (if we don't, - * softirq handlers can get locked out, which prevents - * incoming packets from being handled). - */ - set_current_state(TASK_INTERRUPTIBLE); - if (list_first_or_null_rcu(&homa->throttled_rpcs, - struct homa_rpc, throttled_links) == NULL) - tt_record("pacer sleeping"); - else - __set_current_state(TASK_RUNNING); - INC_METRIC(pacer_cycles, get_cycles() - homa->pacer_wake_time); - homa->pacer_wake_time = 0; - schedule(); - homa->pacer_wake_time = get_cycles(); - __set_current_state(TASK_RUNNING); - } - kthread_complete_and_exit(&homa_pacer_kthread_done, 0); - return 0; + return; } /** - * homa_pacer_xmit() - Transmit packets from the throttled list. Note: - * this function may be invoked from either process context or softirq (BH) - * level. 
This function is invoked from multiple places, not just in the - * pacer thread. The reason for this is that (as of 10/2019) Linux's scheduling - * of the pacer thread is unpredictable: the thread may block for long periods - * of time (e.g., because it is assigned to the same CPU as a busy interrupt - * handler). This can result in poor utilization of the network link. So, - * this method gets invoked from other places as well, to increase the - * likelihood that we keep the link busy. Those other invocations are not - * guaranteed to happen, so the pacer thread provides a backstop. - * @homa: Overall data about the Homa protocol implementation. + * homa_rpc_tx_end() - Return the offset of the first byte in an + * RPC's outgoing message that has not yet been fully transmitted. + * "Fully transmitted" means the message has been transmitted by the + * NIC and the skb has been released by the driver. This is different from + * rpc->msgout.next_xmit_offset, which computes the first offset that + * hasn't yet been passed to the IP stack. + * @rpc: RPC to check + * Return: See above. If the message has been fully transmitted then + * rpc->msgout.length is returned. */ -void homa_pacer_xmit(struct homa *homa) +int homa_rpc_tx_end(struct homa_rpc *rpc) { - struct homa_rpc *rpc; - int i; - - /* Make sure only one instance of this function executes at a - * time. - */ - if (!spin_trylock_bh(&homa->pacer_mutex)) - return; + struct sk_buff *skb = rpc->msgout.first_not_tx; - /* Each iteration through the following loop sends one packet. We - * limit the number of passes through this loop in order to cap the - * time spent in one call to this function (see note in - * homa_pacer_main about interfering with softirq handlers). - */ - for (i = 0; i < 5; i++) { - __u64 idle_time, now; - - /* If the NIC queue is too long, wait until it gets shorter. */ - now = get_cycles(); - idle_time = atomic64_read(&homa->link_idle_time); - while ((now + homa->max_nic_queue_cycles) < idle_time) { - /* If we've xmitted at least one packet then - * return (this helps with testing and also - * allows homa_pacer_main to yield the core). - */ - if (i != 0) - goto done; - now = get_cycles(); - } - /* Note: when we get here, it's possible that the NIC queue is - * still too long because other threads have queued packets, - * but we transmit anyway so we don't starve (see perf.text - * for more info). - */ + while (skb) { + struct homa_skb_info *homa_info = homa_get_skb_info(skb); - /* Lock the first throttled RPC. This may not be possible - * because we have to hold throttle_lock while locking - * the RPC; that means we can't wait for the RPC lock because - * of lock ordering constraints (see sync.txt). Thus, if - * the RPC lock isn't available, do nothing. Holding the - * throttle lock while locking the RPC is important because - * it keeps the RPC from being deleted before it can be locked. + /* next_xmit_offset tells us whether the packet has been + * passed to the IP stack. Checking the reference count tells + * us whether the packet has been released by the driver + * (which only happens after notification from the NIC that + * transmission is complete). 
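+		 * (homa_xmit_data takes an extra reference with skb_get
+		 * before passing each packet to the IP stack, so a count
+		 * greater than one means the driver still holds its
+		 * reference.)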
*/ - homa_throttle_lock(homa); - homa->pacer_fifo_count -= homa->pacer_fifo_fraction; - if (homa->pacer_fifo_count <= 0) { - __u64 oldest = ~0; - struct homa_rpc *cur; - - homa->pacer_fifo_count += 1000; - rpc = NULL; - list_for_each_entry_rcu(cur, &homa->throttled_rpcs, - throttled_links) { - if (cur->msgout.init_cycles < oldest) { - rpc = cur; - oldest = cur->msgout.init_cycles; - } - } - } else - rpc = list_first_or_null_rcu(&homa->throttled_rpcs, - struct homa_rpc, throttled_links); - if (rpc == NULL) { - homa_throttle_unlock(homa); - break; - } - if (!(spin_trylock_bh(rpc->lock))) { - homa_throttle_unlock(homa); - INC_METRIC(pacer_skipped_rpcs, 1); - break; - } - homa_throttle_unlock(homa); - - tt_record4("pacer calling homa_xmit_data for rpc id %llu, " - "port %d, offset %d, bytes_left %d", - rpc->id, rpc->hsk->port, - rpc->msgout.next_xmit_offset, - rpc->msgout.length - rpc->msgout.next_xmit_offset); - homa_xmit_data(rpc, true); - if (!*rpc->msgout.next_xmit || (rpc->msgout.next_xmit_offset - >= rpc->msgout.granted)) { - /* Nothing more to transmit from this message (right now), - * so remove it from the throttled list. - */ - homa_throttle_lock(homa); - if (!list_empty(&rpc->throttled_links)) { - tt_record2("pacer removing id %d from " - "throttled list, offset %d", - rpc->id, - rpc->msgout.next_xmit_offset); - list_del_rcu(&rpc->throttled_links); - if (list_empty(&homa->throttled_rpcs)) - INC_METRIC(throttled_cycles, get_cycles() - - homa->throttle_add); - - /* Note: this reinitialization is only safe - * because the pacer only looks at the first - * element of the list, rather than traversing - * it (and besides, we know the pacer isn't - * active concurrently, since this code *is* - * the pacer). It would not be safe under more - * general usage patterns. - */ - INIT_LIST_HEAD_RCU(&rpc->throttled_links); - } - homa_throttle_unlock(homa); - } - homa_rpc_unlock(rpc); - } - done: - spin_unlock_bh(&homa->pacer_mutex); -} - -/** - * homa_pacer_stop() - Will cause the pacer thread to exit (waking it up - * if necessary); doesn't return until after the pacer thread has exited. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_pacer_stop(struct homa *homa) -{ - homa->pacer_exit = true; - wake_up_process(homa->pacer_kthread); - kthread_stop(homa->pacer_kthread); - homa->pacer_kthread = NULL; -} - -/** - * homa_add_to_throttled() - Make sure that an RPC is on the throttled list - * and wake up the pacer thread if necessary. - * @rpc: RPC with outbound packets that have been granted but can't be - * sent because of NIC queue restrictions. - */ -void homa_add_to_throttled(struct homa_rpc *rpc) -{ - struct homa *homa = rpc->hsk->homa; - struct homa_rpc *candidate; - int bytes_left; - int checks = 0; - __u64 now; - - if (!list_empty(&rpc->throttled_links)) { - return; - } - now = get_cycles(); - if (!list_empty(&homa->throttled_rpcs)) - INC_METRIC(throttled_cycles, now - homa->throttle_add); - homa->throttle_add = now; - bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; - homa_throttle_lock(homa); - list_for_each_entry_rcu(candidate, &homa->throttled_rpcs, - throttled_links) { - int bytes_left_cand; - checks++; - - /* Watch out: the pacer might have just transmitted the last - * packet from candidate. 
- */ - bytes_left_cand = candidate->msgout.length - - candidate->msgout.next_xmit_offset; - if (bytes_left_cand > bytes_left) { - list_add_tail_rcu(&rpc->throttled_links, - &candidate->throttled_links); - goto done; - } - } - list_add_tail_rcu(&rpc->throttled_links, &homa->throttled_rpcs); -done: - homa_throttle_unlock(homa); - wake_up_process(homa->pacer_kthread); - INC_METRIC(throttle_list_adds, 1); - INC_METRIC(throttle_list_checks, checks); -// tt_record("woke up pacer thread"); -} - -/** - * homa_remove_from_throttled() - Make sure that an RPC is not on the - * throttled list. - * @rpc: RPC of interest. - */ -void homa_remove_from_throttled(struct homa_rpc *rpc) -{ - if (unlikely(!list_empty(&rpc->throttled_links))) { - UNIT_LOG("; ", "removing id %llu from throttled list", rpc->id); - homa_throttle_lock(rpc->hsk->homa); - list_del(&rpc->throttled_links); - if (list_empty(&rpc->hsk->homa->throttled_rpcs)) - INC_METRIC(throttled_cycles, get_cycles() - - rpc->hsk->homa->throttle_add); - homa_throttle_unlock(rpc->hsk->homa); - INIT_LIST_HEAD(&rpc->throttled_links); - } -} - -/** - * homa_log_throttled() - Print information to the system log about the - * RPCs on the throttled list. - * @homa: Overall information about the Homa transport. - */ -void homa_log_throttled(struct homa *homa) -{ - struct homa_rpc *rpc; - int rpcs = 0; - int64_t bytes = 0; - - printk(KERN_NOTICE "Printing throttled list\n"); - homa_throttle_lock(homa); - list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { - rpcs++; - if (!(spin_trylock_bh(rpc->lock))) { - printk(KERN_NOTICE "Skipping throttled RPC: locked\n"); - continue; - } - if (*rpc->msgout.next_xmit != NULL) - bytes += rpc->msgout.length - - rpc->msgout.next_xmit_offset; - if (rpcs <= 20) - homa_rpc_log(rpc); - homa_rpc_unlock(rpc); + if (homa_info->offset >= rpc->msgout.next_xmit_offset || + refcount_read(&skb->users) > 1) + return homa_info->offset; + skb = homa_info->next_skb; + rpc->msgout.first_not_tx = skb; } - homa_throttle_unlock(homa); - printk(KERN_NOTICE "Finished printing throttle list: %d rpcs, " - "%lld bytes\n", rpcs, bytes); + return rpc->msgout.length; } diff --git a/homa_pacer.c b/homa_pacer.c new file mode 100644 index 00000000..14c9ff71 --- /dev/null +++ b/homa_pacer.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +/* This file implements the Homa pacer, which implements SRPT for packet + * output. In order to do that, it throttles packet transmission to prevent + * the buildup of large queues in the NIC. + */ + +#include "homa_impl.h" +#include "homa_pacer.h" +#include "homa_rpc.h" + +/** + * homa_pacer_alloc() - Allocate and initialize a new pacer object, which + * will hold pacer-related information for @homa. + * @homa: Homa transport that the pacer will be associated with. + * Return: A pointer to the new struct pacer, or a negative errno. 
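+ *         The pacer kthread is started as a side effect; call
+ *         homa_pacer_free to stop it and release the object.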
+ */ +struct homa_pacer *homa_pacer_alloc(struct homa *homa) +{ + struct homa_pacer *pacer; + int err; + + pacer = kzalloc(sizeof(*pacer), GFP_KERNEL); + if (!pacer) + return ERR_PTR(-ENOMEM); + pacer->homa = homa; + spin_lock_init(&pacer->mutex); + pacer->fifo_count = 1000; + spin_lock_init(&pacer->throttle_lock); + INIT_LIST_HEAD_RCU(&pacer->throttled_rpcs); + init_waitqueue_head(&pacer->wait_queue); + pacer->kthread = kthread_run(homa_pacer_main, pacer, "homa_pacer"); + if (IS_ERR(pacer->kthread)) { + err = PTR_ERR(pacer->kthread); + pr_err("Homa couldn't create pacer thread: error %d\n", err); + goto error; + } + atomic64_set(&pacer->link_idle_time, homa_clock()); + return pacer; + +error: + homa_pacer_free(pacer); + return ERR_PTR(err); +} + +/** + * homa_pacer_free() - Cleanup and free the pacer object for a Homa + * transport. + * @pacer: Object to destroy; caller must not reference the object + * again once this function returns. + */ +void homa_pacer_free(struct homa_pacer *pacer) +{ + if (pacer->kthread) { + kthread_stop(pacer->kthread); + pacer->kthread = NULL; + } + kfree(pacer); +} + +/** + * homa_pacer_check_nic_q() - This function is invoked before passing a + * packet to the NIC for transmission. It serves two purposes. First, it + * maintains an estimate of the NIC queue length. Second, it indicates to + * the caller whether the NIC queue is so full that no new packets should be + * queued (Homa's SRPT depends on keeping the NIC queue short). + * @pacer: Pacer information for a Homa transport. + * @skb: Packet that is about to be transmitted. + * @force: True means this packet is going to be transmitted + * regardless of the queue length. + * Return: Nonzero is returned if either the NIC queue length is + * acceptably short or @force was specified. 0 means that the + * NIC queue is at capacity or beyond, so the caller should delay + * the transmission of @skb. If nonzero is returned, then the + * queue estimate is updated to reflect the transmission of @skb. + */ +int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, + bool force) +{ + u64 idle, new_idle, clock, cycles_for_packet; + int bytes; + + bytes = homa_get_skb_info(skb)->wire_bytes; + cycles_for_packet = pacer->cycles_per_mbyte; + cycles_for_packet *= bytes; + do_div(cycles_for_packet, 1000000); + while (1) { + clock = homa_clock(); + idle = atomic64_read(&pacer->link_idle_time); + if ((clock + pacer->homa->qshared->max_nic_est_backlog_cycles) < + idle && !force && + !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) + return 0; + if (!list_empty(&pacer->throttled_rpcs)) { + INC_METRIC(pacer_homa_packets, 1); + INC_METRIC(pacer_homa_bytes, bytes); + } + if (idle < clock) + new_idle = clock + cycles_for_packet; + else + new_idle = idle + cycles_for_packet; + + /* This method must be thread-safe. */ + if (atomic64_cmpxchg_relaxed(&pacer->link_idle_time, idle, + new_idle) == idle) + break; + } + return 1; +} + +/** + * homa_pacer_main() - Top-level function for the pacer thread. + * @arg: Pointer to pacer struct. + * + * Return: Always 0. 
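+ *
+ * The thread loops until kthread_should_stop returns true, transmitting
+ * throttled packets and sleeping whenever the throttled list is empty.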
+ */ +int homa_pacer_main(void *arg) +{ + struct homa_pacer *pacer = arg; + int status; + u64 start; + + while (1) { + if (kthread_should_stop()) + break; + start = homa_clock(); + homa_pacer_xmit(pacer); + INC_METRIC(pacer_cycles, homa_clock() - start); + if (!list_empty(&pacer->throttled_rpcs)) { + /* NIC queue is full; before calling pacer again, + * give other threads a chance to run (otherwise + * low-level packet processing such as softirq could + * get locked out). + */ + schedule(); + continue; + } + + tt_record("pacer sleeping"); + status = wait_event_interruptible(pacer->wait_queue, + kthread_should_stop() || + !list_empty(&pacer->throttled_rpcs)); + tt_record1("pacer woke up with status %d", status); + if (status != 0 && status != -ERESTARTSYS) + break; + } + return 0; +} + +/** + * homa_pacer_xmit() - Transmit packets from the throttled list until + * either (a) the throttled list is empty or (b) the NIC queue has + * reached maximum allowable length. Note: this function may be invoked + * from either process context or softirq (BH) level. This function is + * invoked from multiple places, not just in the pacer thread. The reason + * for this is that (as of 10/2019) Linux's scheduling of the pacer thread + * is unpredictable: the thread may block for long periods of time (e.g., + * because it is assigned to the same CPU as a busy interrupt handler). + * This can result in poor utilization of the network link. So, this method + * gets invoked from other places as well, to increase the likelihood that we + * keep the link busy. Those other invocations are not guaranteed to happen, + * so the pacer thread provides a backstop. + * @pacer: Pacer information for a Homa transport. + */ +void homa_pacer_xmit(struct homa_pacer *pacer) +{ + struct homa_rpc *rpc; + s64 queue_cycles; + + /* Make sure only one instance of this function executes at a time. */ + if (!spin_trylock_bh(&pacer->mutex)) + return; + + while (1) { + queue_cycles = atomic64_read(&pacer->link_idle_time) - + homa_clock(); + if (queue_cycles >= + pacer->homa->qshared->max_nic_est_backlog_cycles) + break; + if (list_empty(&pacer->throttled_rpcs)) + break; + + /* Select an RPC to transmit (either SRPT or FIFO) and + * take a reference on it. Must do this while holding the + * throttle_lock to prevent the RPC from being reaped. Then + * release the throttle lock and lock the RPC (can't acquire + * the RPC lock while holding the throttle lock; see "Homa + * Locking Strategy" in homa_impl.h). + */ + homa_pacer_throttle_lock(pacer); + pacer->fifo_count -= pacer->homa->qshared->fifo_fraction; + if (pacer->fifo_count <= 0) { + struct homa_rpc *cur; + u64 oldest = ~0; + + pacer->fifo_count += 1000; + rpc = NULL; + list_for_each_entry(cur, &pacer->throttled_rpcs, + throttled_links) { + if (cur->msgout.init_time < oldest) { + rpc = cur; + oldest = cur->msgout.init_time; + } + } + } else { + rpc = list_first_entry_or_null(&pacer->throttled_rpcs, + struct homa_rpc, + throttled_links); + } + if (!rpc) { + homa_pacer_throttle_unlock(pacer); + break; + } + homa_rpc_hold(rpc); + homa_pacer_throttle_unlock(pacer); + homa_rpc_lock(rpc); + tt_record4("pacer calling homa_xmit_data for rpc id %llu, port %d, offset %d, bytes_left %d", + rpc->id, rpc->hsk->port, + rpc->msgout.next_xmit_offset, + rpc->msgout.length - rpc->msgout.next_xmit_offset); + homa_xmit_data(rpc, true); + + /* Note: rpc->state could be RPC_DEAD here, but the code + * below should work anyway. 
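+		 * The reference taken with homa_rpc_hold above keeps
+		 * the RPC structure from being reaped while it is in
+		 * use here.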
+ */ + if (!*rpc->msgout.next_xmit || rpc->msgout.next_xmit_offset >= + rpc->msgout.granted) { + /* No more data can be transmitted from this message + * (right now), so remove it from the throttled list. + */ + tt_record2("pacer removing id %d from throttled list, offset %d", + rpc->id, rpc->msgout.next_xmit_offset); + homa_pacer_unmanage_rpc(rpc); + } + homa_rpc_unlock(rpc); + homa_rpc_put(rpc); + } + spin_unlock_bh(&pacer->mutex); +} + +/** + * homa_pacer_manage_rpc() - Arrange for the pacer to transmit packets + * from this RPC (make sure that an RPC is on the throttled list and wake up + * the pacer thread if necessary). + * @rpc: RPC with outbound packets that have been granted but can't be + * sent because of NIC queue restrictions. Must be locked by caller. + */ +void homa_pacer_manage_rpc(struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) +{ + struct homa_pacer *pacer = rpc->hsk->homa->pacer; + struct homa_rpc *candidate; + int bytes_left; + int checks = 0; + u64 now; + + if (!list_empty(&rpc->throttled_links)) + return; + now = homa_clock(); + if (!list_empty(&pacer->throttled_rpcs)) + INC_METRIC(nic_backlog_cycles, now - pacer->throttle_add); + pacer->throttle_add = now; + bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; + homa_pacer_throttle_lock(pacer); + list_for_each_entry(candidate, &pacer->throttled_rpcs, + throttled_links) { + int bytes_left_cand; + + checks++; + + /* Watch out: the pacer might have just transmitted the last + * packet from candidate. + */ + bytes_left_cand = candidate->msgout.length - + candidate->msgout.next_xmit_offset; + if (bytes_left_cand > bytes_left) { + list_add_tail(&rpc->throttled_links, + &candidate->throttled_links); + goto done; + } + } + list_add_tail(&rpc->throttled_links, &pacer->throttled_rpcs); +done: + homa_pacer_throttle_unlock(pacer); + wake_up(&pacer->wait_queue); + INC_METRIC(throttle_list_adds, 1); + INC_METRIC(throttle_list_checks, checks); +// tt_record("woke up pacer thread"); +} + +/** + * homa_pacer_unmanage_rpc() - Make sure that an RPC is no longer managed + * by the pacer. + * @rpc: RPC of interest. + */ +void homa_pacer_unmanage_rpc(struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) +{ + struct homa_pacer *pacer = rpc->hsk->homa->pacer; + + if (unlikely(!list_empty(&rpc->throttled_links))) { + UNIT_LOG("; ", "removing id %llu from throttled list", rpc->id); + homa_pacer_throttle_lock(pacer); + list_del_init(&rpc->throttled_links); + if (list_empty(&pacer->throttled_rpcs)) + INC_METRIC(nic_backlog_cycles, homa_clock() + - pacer->throttle_add); + homa_pacer_throttle_unlock(pacer); + } +} + +/** + * homa_pacer_update_sysctl_deps() - Update any pacer fields that depend + * on values set by sysctl. This function is invoked anytime a pacer sysctl + * value is updated. + * @pacer: Pacer to update. + */ +void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer) +{ + u64 tmp; + + /* Underestimate link bandwidth (overestimate time) by 1%. */ + tmp = 101 * 8000 * (u64)homa_clock_khz(); + do_div(tmp, pacer->homa->link_mbps * 100); + pacer->cycles_per_mbyte = tmp; +} + +/** + * homa_pacer_log_throttled() - Print information to the system log about the + * RPCs on the throttled list. + * @pacer: Pacer information for a Homa transport. 
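+ *
+ * RPCs whose locks cannot be acquired without blocking are skipped;
+ * detailed information is logged for at most 20 RPCs.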
+ */
+void homa_pacer_log_throttled(struct homa_pacer *pacer)
+{
+	struct homa_rpc *rpc;
+	s64 bytes = 0;
+	int rpcs = 0;
+
+	pr_notice("Printing throttled list\n");
+	homa_pacer_throttle_lock(pacer);
+	list_for_each_entry_rcu(rpc, &pacer->throttled_rpcs, throttled_links) {
+		rpcs++;
+		if (!homa_rpc_try_lock(rpc)) {
+			pr_notice("Skipping throttled RPC: locked\n");
+			continue;
+		}
+		if (*rpc->msgout.next_xmit)
+			bytes += rpc->msgout.length
+					- rpc->msgout.next_xmit_offset;
+		if (rpcs <= 20)
+			homa_rpc_log(rpc);
+		homa_rpc_unlock(rpc);
+	}
+	homa_pacer_throttle_unlock(pacer);
+	pr_notice("Finished printing throttle list: %d rpcs, %lld bytes\n",
+		  rpcs, bytes);
+}
+
+/**
+ * homa_pacer_throttle_lock_slow() - This function implements the slow path for
+ * acquiring the throttle lock. It is invoked when the lock isn't immediately
+ * available. It waits for the lock, but also records statistics about
+ * the waiting time.
+ * @pacer:    Pacer information for a Homa transport.
+ */
+void homa_pacer_throttle_lock_slow(struct homa_pacer *pacer)
+	__acquires(pacer->throttle_lock)
+{
+	u64 start = homa_clock();
+
+	tt_record("beginning wait for throttle lock");
+	spin_lock_bh(&pacer->throttle_lock);
+	tt_record("ending wait for throttle lock");
+	INC_METRIC(throttle_lock_misses, 1);
+	INC_METRIC(throttle_lock_miss_cycles, homa_clock() - start);
+}
diff --git a/homa_pacer.h b/homa_pacer.h
new file mode 100644
index 00000000..0d537d4f
--- /dev/null
+++ b/homa_pacer.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file defines structs and functions related to the Homa pacer,
+ * which implements SRPT for packet output. In order to do that, it
+ * throttles packet transmission to prevent the buildup of
+ * large queues in the NIC.
+ */
+
+#ifndef _HOMA_PACER_H
+#define _HOMA_PACER_H
+
+#include "homa_impl.h"
+#include "homa_qdisc.h"
+
+/**
+ * struct homa_pacer - Contains information that the pacer uses to
+ * manage packet output. There is one instance of this object stored
+ * in each struct homa.
+ */
+struct homa_pacer {
+	/** @homa: Transport that this pacer is associated with. */
+	struct homa *homa;
+
+	/**
+	 * @mutex: Ensures that only one instance of homa_pacer_xmit
+	 * runs at a time. Only used in "try" mode: never block on this.
+	 */
+	spinlock_t mutex;
+
+	/**
+	 * @fifo_count: When this becomes <= zero, it's time for the
+	 * pacer to allow the oldest RPC to transmit.
+	 */
+	int fifo_count;
+
+	/**
+	 * @throttle_lock: Used to synchronize access to @throttled_rpcs. Must
+	 * be held when inserting or removing an RPC from throttled_rpcs.
+	 */
+	spinlock_t throttle_lock;
+
+	/**
+	 * @throttled_rpcs: Contains all homa_rpcs that have bytes ready
+	 * for transmission, but which couldn't be sent without exceeding
+	 * the NIC queue limit.
+	 */
+	struct list_head throttled_rpcs;
+
+	/**
+	 * @throttle_add: The most recent homa_clock() time when an RPC was
+	 * added to @throttled_rpcs.
+	 */
+	u64 throttle_add;
+
+	/**
+	 * @cycles_per_mbyte: the number of homa_clock() cycles that it takes to
+	 * transmit 10**6 bytes on our uplink. This is actually a slight
+	 * overestimate of the value, to ensure that we don't underestimate
+	 * NIC queue length and queue too many packets.
+	 */
+	u32 cycles_per_mbyte;
+
+	/**
+	 * @wait_queue: Used to block the pacer thread when there
+	 * are no throttled RPCs.
+	 */
+	struct wait_queue_head wait_queue;
+
+	/**
+	 * @kthread: Kernel thread that transmits packets from
+	 * throttled_rpcs in a way that limits queue buildup in the
+	 * NIC.
+	 */
+	struct task_struct *kthread;
+
+	/**
+	 * @link_idle_time: The homa_clock() time at which we estimate
+	 * that all of the packets we have passed to the NIC for transmission
+	 * will have been transmitted. May be in the past. This estimate
+	 * assumes that only Homa is transmitting data, so it could be a
+	 * severe underestimate if there is competing traffic from, say, TCP.
+	 */
+	atomic64_t link_idle_time ____cacheline_aligned_in_smp;
+};
+
+struct homa_pacer *homa_pacer_alloc(struct homa *homa);
+int homa_pacer_check_nic_q(struct homa_pacer *pacer,
+			   struct sk_buff *skb, bool force);
+int homa_pacer_dointvec(const struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos);
+void homa_pacer_free(struct homa_pacer *pacer);
+void homa_pacer_unmanage_rpc(struct homa_rpc *rpc);
+void homa_pacer_log_throttled(struct homa_pacer *pacer);
+int homa_pacer_main(void *transport);
+void homa_pacer_manage_rpc(struct homa_rpc *rpc);
+void homa_pacer_throttle_lock_slow(struct homa_pacer *pacer);
+void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer);
+void homa_pacer_xmit(struct homa_pacer *pacer);
+
+/**
+ * homa_pacer_check() - This method is invoked at various places in Homa to
+ * see if the pacer needs to transmit more packets and, if so, transmit
+ * them. It's needed because the pacer thread may get descheduled by
+ * Linux, resulting in output stalls.
+ * @pacer: Pacer information for a Homa transport.
+ */
+static inline void homa_pacer_check(struct homa_pacer *pacer)
+{
+	if (list_empty(&pacer->throttled_rpcs))
+		return;
+
+	/* The ">> 1" in the line below gives homa_pacer_main the first chance
+	 * to queue new packets; if the NIC queue becomes more than half
+	 * empty, then we will help out here.
+	 */
+	if ((homa_clock() + (pacer->homa->qshared->max_nic_est_backlog_cycles >>
+			1)) < atomic64_read(&pacer->link_idle_time))
+		return;
+	tt_record("homa_pacer_check calling homa_pacer_xmit");
+	homa_pacer_xmit(pacer);
+}
+
+/**
+ * homa_pacer_throttle_lock() - Acquire the throttle lock. If the lock
+ * isn't immediately available, record stats on the waiting time.
+ * @pacer: Pacer information for a Homa transport.
+ */
+static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer)
+	__acquires(pacer->throttle_lock)
+{
+	if (!spin_trylock_bh(&pacer->throttle_lock))
+		homa_pacer_throttle_lock_slow(pacer);
+}
+
+/**
+ * homa_pacer_throttle_unlock() - Release the throttle lock.
+ * @pacer: Pacer information for a Homa transport.
+ */
+static inline void homa_pacer_throttle_unlock(struct homa_pacer *pacer)
+	__releases(pacer->throttle_lock)
+{
+	spin_unlock_bh(&pacer->throttle_lock);
+}
+
+#endif /* _HOMA_PACER_H */
diff --git a/homa_peer.c b/homa_peer.c
new file mode 100644
index 00000000..6aa36a0c
--- /dev/null
+++ b/homa_peer.c
@@ -0,0 +1,751 @@
+// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+/* This file provides functions related to homa_peer and homa_peertab
+ * objects.
+ */ + +#include "homa_impl.h" +#include "homa_peer.h" +#include "homa_rpc.h" +#include "murmurhash3.h" + +#ifdef __UNIT_TEST__ +#undef rhashtable_init +#define rhashtable_init mock_rht_init + +#undef rhashtable_lookup_get_insert_fast +#define rhashtable_lookup_get_insert_fast mock_rht_lookup_get_insert_fast + +#undef rhashtable_walk_next +#define rhashtable_walk_next mock_rht_walk_next +#endif /* __UNIT_TEST__ */ + +static const struct rhashtable_params ht_params = { + .key_len = sizeof(struct homa_peer_key), + .key_offset = offsetof(struct homa_peer, ht_key), + .head_offset = offsetof(struct homa_peer, ht_linkage), + .nelem_hint = 10000, + .hashfn = murmurhash3, + .obj_cmpfn = homa_peer_compare +}; + +#ifndef __STRIP__ /* See strip.py */ +/* Used to enable sysctl access to peertab-specific configuration parameters. + * The @data fields are actually offsets within a struct homa_peertab; these + * are converted to pointers into a struct peertab later. + */ +#define OFFSET(field) ((void *)offsetof(struct homa_peertab, field)) +static struct ctl_table peer_ctl_table[] = { + { + .procname = "peer_gc_threshold", + .data = OFFSET(gc_threshold), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_peer_dointvec + }, + { + .procname = "peer_idle_secs_min", + .data = OFFSET(idle_secs_min), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_peer_dointvec + }, + { + .procname = "peer_idle_secs_max", + .data = OFFSET(idle_secs_max), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_peer_dointvec + }, + { + .procname = "peer_net_max", + .data = OFFSET(net_max), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_peer_dointvec + }, +}; +#endif /* See strip.py */ + +/** + * homa_peer_alloc_peertab() - Allocate and initialize a homa_peertab. + * + * Return: A pointer to the new homa_peertab, or ERR_PTR(-errno) if there + * was a problem. + */ +struct homa_peertab *homa_peer_alloc_peertab(void) +{ + struct homa_peertab *peertab; + int err; + + peertab = kzalloc(sizeof(*peertab), GFP_KERNEL); + if (!peertab) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&peertab->lock); + err = rhashtable_init(&peertab->ht, &ht_params); + if (err) { + kfree(peertab); + return ERR_PTR(err); + } + peertab->ht_valid = true; + rhashtable_walk_enter(&peertab->ht, &peertab->ht_iter); + peertab->gc_threshold = 5000; + peertab->net_max = 10000; + peertab->idle_secs_min = 10; + peertab->idle_secs_max = 120; + +#ifndef __STRIP__ /* See strip.py */ + peertab->sysctl_header = register_net_sysctl(&init_net, "net/homa", + peer_ctl_table); + if (!peertab->sysctl_header) { + err = -ENOMEM; + pr_err("couldn't register sysctl parameters for Homa peertab\n"); + goto error; + } +#endif /* See strip.py */ + homa_peer_update_sysctl_deps(peertab); + return peertab; + +#ifndef __STRIP__ /* See strip.py */ +error: + homa_peer_free_peertab(peertab); + return ERR_PTR(err); +#endif /* See strip.py */ +} + +/** + * homa_peer_free_net() - Garbage collect all of the peer information + * associated with a particular network namespace. + * @hnet: Network namespace whose peers should be freed. There must not + * be any active sockets or RPCs for this namespace. 
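+ *
+ * Expected to be invoked during namespace teardown, e.g. from Homa's
+ * pernet exit hook (a hypothetical sketch; homa_net_exit is the .exit
+ * handler registered in homa_plumbing.c, but its body is assumed here):
+ *
+ *	static void homa_net_exit(struct net *net)
+ *	{
+ *		struct homa_net *hnet = net_generic(net, homa_net_id);
+ *
+ *		homa_peer_free_net(hnet);
+ *	}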
+ */
+void homa_peer_free_net(struct homa_net *hnet)
+{
+	struct homa_peertab *peertab = hnet->homa->peertab;
+	struct rhashtable_iter iter;
+	struct homa_peer *peer;
+
+	spin_lock_bh(&peertab->lock);
+	peertab->gc_stop_count++;
+	spin_unlock_bh(&peertab->lock);
+
+	rhashtable_walk_enter(&peertab->ht, &iter);
+	rhashtable_walk_start(&iter);
+	while (1) {
+		peer = rhashtable_walk_next(&iter);
+		if (!peer)
+			break;
+		if (IS_ERR(peer))
+			continue;
+		if (peer->ht_key.hnet != hnet)
+			continue;
+		if (rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage,
+					   ht_params) == 0) {
+			homa_peer_release(peer);
+			hnet->num_peers--;
+			peertab->num_peers--;
+		}
+	}
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
+	WARN(hnet->num_peers != 0, "%s ended up with hnet->num_peers %d",
+	     __func__, hnet->num_peers);
+
+	spin_lock_bh(&peertab->lock);
+	peertab->gc_stop_count--;
+	spin_unlock_bh(&peertab->lock);
+}
+
+/**
+ * homa_peer_release_fn() - This function is invoked for each entry in
+ * the peer hash table by the rhashtable code when the table is being
+ * deleted. It frees its argument.
+ * @object: homa_peer to free.
+ * @dummy: Not used.
+ */
+void homa_peer_release_fn(void *object, void *dummy)
+{
+	struct homa_peer *peer = object;
+
+	homa_peer_release(peer);
+}
+
+/**
+ * homa_peer_free_peertab() - Destructor for homa_peertabs.
+ * @peertab: The table to destroy.
+ */
+void homa_peer_free_peertab(struct homa_peertab *peertab)
+{
+	spin_lock_bh(&peertab->lock);
+	peertab->gc_stop_count++;
+	spin_unlock_bh(&peertab->lock);
+
+	if (peertab->ht_valid) {
+		rhashtable_walk_exit(&peertab->ht_iter);
+		rhashtable_free_and_destroy(&peertab->ht, homa_peer_release_fn,
+					    NULL);
+	}
+#ifndef __STRIP__ /* See strip.py */
+	if (peertab->sysctl_header) {
+		unregister_net_sysctl_table(peertab->sysctl_header);
+		peertab->sysctl_header = NULL;
+	}
+#endif /* See strip.py */
+	kfree(peertab);
+}
+
+/**
+ * homa_peer_prefer_evict() - Given two peers, determine which one is
+ * a better candidate for eviction.
+ * @peertab: Overall information used to manage peers.
+ * @peer1: First peer.
+ * @peer2: Second peer.
+ * Return: True if @peer1 is a better candidate for eviction than @peer2.
+ */
+int homa_peer_prefer_evict(struct homa_peertab *peertab,
+			   struct homa_peer *peer1,
+			   struct homa_peer *peer2)
+{
+	/* Prefer a peer whose homa_net is over its limit; if both are either
+	 * over or under, then prefer the peer with the longest idle time.
+	 */
+	if (peer1->ht_key.hnet->num_peers > peertab->net_max) {
+		if (peer2->ht_key.hnet->num_peers <= peertab->net_max)
+			return true;
+		else
+			return peer1->access_jiffies < peer2->access_jiffies;
+	}
+	if (peer2->ht_key.hnet->num_peers > peertab->net_max)
+		return false;
+	else
+		return peer1->access_jiffies < peer2->access_jiffies;
+}
+
+/**
+ * homa_peer_pick_victims() - Select a few peers that can be freed.
+ * @peertab: Choose peers that are stored here.
+ * @victims: Return addresses of victims here.
+ * @max_victims: Limit on how many victims to choose (and size of @victims
+ * array).
+ * Return: The number of peers stored in @victims; may be zero.
+ */
+int homa_peer_pick_victims(struct homa_peertab *peertab,
+			   struct homa_peer *victims[], int max_victims)
+{
+	struct homa_peer *peer;
+	int num_victims = 0;
+	int to_scan;
+	int i, idle;
+
+	/* Scan 2 peers for every potential victim and keep the "best"
+	 * peers for removal.
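+	 *
+	 * For example (with max_victims == 5, as homa_peer_gc uses), up
+	 * to 10 peers are scanned, and victims[] is kept sorted so that
+	 * victims[0] is the strongest eviction candidate found so far,
+	 * as judged by homa_peer_prefer_evict().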
+	 */
+	rhashtable_walk_start(&peertab->ht_iter);
+	for (to_scan = 2 * max_victims; to_scan > 0; to_scan--) {
+		peer = rhashtable_walk_next(&peertab->ht_iter);
+		if (!peer) {
+			/* Reached the end of the table; restart at
+			 * the beginning.
+			 */
+			rhashtable_walk_stop(&peertab->ht_iter);
+			rhashtable_walk_exit(&peertab->ht_iter);
+			rhashtable_walk_enter(&peertab->ht, &peertab->ht_iter);
+			rhashtable_walk_start(&peertab->ht_iter);
+			peer = rhashtable_walk_next(&peertab->ht_iter);
+			if (!peer)
+				break;
+		}
+		if (IS_ERR(peer)) {
+			/* rhashtable decided to restart the search at the
+			 * beginning.
+			 */
+			peer = rhashtable_walk_next(&peertab->ht_iter);
+			if (!peer || IS_ERR(peer))
+				break;
+		}
+
+		/* Has this peer been idle long enough to be a candidate
+		 * for eviction?
+		 */
+		idle = jiffies - peer->access_jiffies;
+		if (idle < peertab->idle_jiffies_min)
+			continue;
+		if (idle < peertab->idle_jiffies_max &&
+		    peer->ht_key.hnet->num_peers <= peertab->net_max)
+			continue;
+
+		/* Sort the candidate into the existing list of victims. */
+		for (i = 0; i < num_victims; i++) {
+			if (peer == victims[i]) {
+				/* This can happen if there aren't very many
+				 * peers and we wrapped around in the hash
+				 * table.
+				 */
+				peer = NULL;
+				break;
+			}
+			if (homa_peer_prefer_evict(peertab, peer, victims[i])) {
+				struct homa_peer *tmp;
+
+				tmp = victims[i];
+				victims[i] = peer;
+				peer = tmp;
+			}
+		}
+
+		if (num_victims < max_victims && peer) {
+			victims[num_victims] = peer;
+			num_victims++;
+		}
+	}
+	rhashtable_walk_stop(&peertab->ht_iter);
+	return num_victims;
+}
+
+/**
+ * homa_peer_gc() - This function is invoked by Homa at regular intervals;
+ * its job is to ensure that the number of peers stays within limits.
+ * If the number grows too large, it selectively deletes peers to get
+ * back under the limit.
+ * @peertab: Structure whose peers should be considered for garbage
+ * collection.
+ */
+void homa_peer_gc(struct homa_peertab *peertab)
+{
+#define EVICT_BATCH_SIZE 5
+	struct homa_peer *victims[EVICT_BATCH_SIZE];
+	int num_victims;
+	int i;
+
+	spin_lock_bh(&peertab->lock);
+	if (peertab->gc_stop_count != 0)
+		goto done;
+	if (peertab->num_peers < peertab->gc_threshold)
+		goto done;
+	num_victims = homa_peer_pick_victims(peertab, victims,
+					     EVICT_BATCH_SIZE);
+	if (num_victims == 0)
+		goto done;
+
+	for (i = 0; i < num_victims; i++) {
+		struct homa_peer *peer = victims[i];
+
+		if (rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage,
+					   ht_params) == 0) {
+			homa_peer_release(peer);
+			peertab->num_peers--;
+			peer->ht_key.hnet->num_peers--;
+			tt_record1("homa_peer_gc removed homa_peer 0x%x",
+				   tt_addr(peer->addr));
+		}
+	}
+done:
+	spin_unlock_bh(&peertab->lock);
+}
+
+/**
+ * homa_peer_alloc() - Allocate and initialize a new homa_peer object.
+ * @hsk: Socket for which the peer will be used.
+ * @addr: Address of the desired host: IPv4 addresses are represented
+ * as IPv4-mapped IPv6 addresses.
+ * Return: The peer associated with @addr, or a negative errno if an
+ * error occurred. On a successful return the reference count
+ * will be incremented for the returned peer. Sets hsk->error_msg
+ * on errors.
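+ *
+ * Typical caller pattern (a sketch; homa_peer_get in this file is the
+ * real caller):
+ *
+ *	peer = homa_peer_alloc(hsk, addr);
+ *	if (IS_ERR(peer))
+ *		return peer;
+ *	...
+ *	homa_peer_release(peer);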
+ */ +struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, + const struct in6_addr *addr) +{ + struct homa_peer *peer; + int status; + + peer = kzalloc(sizeof(*peer), GFP_ATOMIC); + if (!peer) { + INC_METRIC(peer_kmalloc_errors, 1); + hsk->error_msg = "couldn't allocate memory for homa_peer"; + return (struct homa_peer *)ERR_PTR(-ENOMEM); + } + peer->ht_key.addr = *addr; + peer->ht_key.hnet = hsk->hnet; + refcount_set(&peer->refs, 1); + peer->access_jiffies = jiffies; + spin_lock_init(&peer->lock); +#ifndef __STRIP__ /* See strip.py */ + peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; + peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = INT_MAX; + INIT_LIST_HEAD(&peer->grantable_rpcs); + INIT_LIST_HEAD(&peer->grantable_links); +#endif /* See strip.py */ + peer->current_ticks = -1; + + status = homa_peer_reset_dst(peer, hsk); + if (status != 0) { + hsk->error_msg = "couldn't find route for peer"; + kfree(peer); + return ERR_PTR(status); + } + tt_record1("Allocated new homa_peer for node 0x%x", + tt_addr(peer->addr)); + INC_METRIC(peer_allocs, 1); + return peer; +} + +/** + * homa_peer_free() - Release any resources in a peer and free the homa_peer + * struct. Invoked by the RCU mechanism via homa_peer_release. + * @head: Pointer to the rcu_head field of the peer to free. + */ +void homa_peer_free(struct rcu_head *head) +{ + struct homa_peer *peer; + + peer = container_of(head, struct homa_peer, rcu_head); + dst_release(rcu_dereference(peer->dst)); + kfree(peer); +} + +/** + * homa_peer_get() - Returns the peer associated with a given host; creates + * a new homa_peer if one doesn't already exist. + * @hsk: Socket where the peer will be used. + * @addr: Address of the desired host: IPv4 addresses are represented + * as IPv4-mapped IPv6 addresses. + * + * Return: The peer associated with @addr, or a negative errno if an + * error occurred. On a successful return the reference count + * will be incremented for the returned peer. The caller must + * eventually call homa_peer_release to release the reference. + */ +struct homa_peer *homa_peer_get(struct homa_sock *hsk, + const struct in6_addr *addr) +{ + struct homa_peertab *peertab = hsk->homa->peertab; + struct homa_peer *peer, *other; + struct homa_peer_key key; + + key.addr = *addr; + key.hnet = hsk->hnet; + rcu_read_lock(); + peer = rhashtable_lookup(&peertab->ht, &key, ht_params); + if (peer) { + homa_peer_hold(peer); + peer->access_jiffies = jiffies; + rcu_read_unlock(); + return peer; + } + + /* No existing entry, so we have to create a new one. */ + peer = homa_peer_alloc(hsk, addr); + if (IS_ERR(peer)) { + rcu_read_unlock(); + return peer; + } + spin_lock_bh(&peertab->lock); + other = rhashtable_lookup_get_insert_fast(&peertab->ht, + &peer->ht_linkage, ht_params); + if (IS_ERR(other)) { + /* Couldn't insert; return the error info. */ + homa_peer_release(peer); + peer = other; + } else if (other) { + /* Someone else already created the desired peer; use that + * one instead of ours. + */ + homa_peer_release(peer); + homa_peer_hold(other); + peer = other; + peer->access_jiffies = jiffies; + } else { + homa_peer_hold(peer); + peertab->num_peers++; + key.hnet->num_peers++; + } + spin_unlock_bh(&peertab->lock); + rcu_read_unlock(); + return peer; +} + +/** + * homa_get_dst() - Returns destination information associated with a peer, + * updating it if the cached information is stale. + * @peer: Peer whose destination information is desired. 
+ * @hsk: Homa socket with which the dst will be used; needed by lower-level + * code to recreate the dst. + * Return: Up-to-date destination for peer; a reference has been taken + * on this dst_entry, which the caller must eventually release. + */ +struct dst_entry *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk) +{ + struct dst_entry *dst; + int pass; + + rcu_read_lock(); + for (pass = 0; ; pass++) { + do { + /* This loop repeats only if we happen to fetch + * the dst right when it is being reset. + */ + dst = rcu_dereference(peer->dst); + } while (!dst_hold_safe(dst)); + + /* After the first pass it's OK to return an obsolete dst + * (we're basically giving up; continuing could result in + * an infinite loop if homa_dst_refresh can't create a new dst). + */ + if (dst_check(dst, peer->dst_cookie) || pass > 0) + break; + dst_release(dst); + INC_METRIC(peer_dst_refreshes, 1); + homa_peer_reset_dst(peer, hsk); + } + rcu_read_unlock(); + return dst; +} + +/** + * homa_peer_reset_dst() - Find an appropriate dst_entry for a peer and + * store it in the peer's dst field. If the field is already set, the + * current value is assumed to be stale and will be discarded if a new + * dst_entry can be created. + * @peer: The peer whose dst field should be reset. + * @hsk: Socket that will be used for sending packets. + * Return: Zero for success, or a negative errno if there was an error + * (in which case the existing value for the dst field is left + * in place). + */ +int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) +{ + struct dst_entry *dst; + int result = 0; + + homa_peer_lock(peer); + memset(&peer->flow, 0, sizeof(peer->flow)); + if (hsk->sock.sk_family == AF_INET) { + struct rtable *rt; + + flowi4_init_output(&peer->flow.u.ip4, hsk->sock.sk_bound_dev_if, + hsk->sock.sk_mark, hsk->inet.tos, + RT_SCOPE_UNIVERSE, hsk->sock.sk_protocol, 0, + ipv6_to_ipv4(peer->addr), + hsk->inet.inet_saddr, 0, 0, + hsk->sock.sk_uid); + security_sk_classify_flow(&hsk->sock, + &peer->flow.u.__fl_common); + rt = ip_route_output_flow(sock_net(&hsk->sock), + &peer->flow.u.ip4, &hsk->sock); + if (IS_ERR(rt)) { + result = PTR_ERR(rt); + INC_METRIC(peer_route_errors, 1); + goto done; + } + dst = &rt->dst; + peer->dst_cookie = 0; + } else { + /* This code is derived from code in tcp_v6_connect. */ + peer->flow.u.ip6.flowi6_proto = hsk->sock.sk_protocol; + peer->flow.u.ip6.daddr = peer->addr; + peer->flow.u.ip6.saddr = hsk->inet.pinet6->saddr; + peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(hsk->inet.tos, + 0); + peer->flow.u.ip6.flowi6_oif = hsk->sock.sk_bound_dev_if; + peer->flow.u.ip6.flowi6_mark = hsk->sock.sk_mark; + peer->flow.u.ip6.fl6_dport = 0; + peer->flow.u.ip6.fl6_sport = 0; + peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid; + security_sk_classify_flow(&hsk->sock, + &peer->flow.u.__fl_common); + dst = ip6_dst_lookup_flow(sock_net(&hsk->sock), &hsk->sock, + &peer->flow.u.ip6, NULL); + if (IS_ERR(dst)) { + result = PTR_ERR(dst); + INC_METRIC(peer_route_errors, 1); + goto done; + } + peer->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); + } + + /* From the standpoint of homa_get_dst, peer->dst is not updated + * atomically with peer->dst_cookie, which means homa_get_dst could + * use a new cookie with an old dest. Fortunately, this is benign; at + * worst, it might cause an obsolete dst to be reused (resulting in + * a lost packet) or a valid dst to be replaced (resulting in + * unnecessary work). 
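+	 *
+	 * rcu_replace_pointer() below publishes the new dst and returns
+	 * the old one; releasing the old dst's reference here is what
+	 * keeps route refreshes from leaking dst_entry objects.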
+ */ + dst_release(rcu_replace_pointer(peer->dst, dst, true)); + +done: + homa_peer_unlock(peer); + return result; +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_unsched_priority() - Returns the priority level to use for + * unscheduled packets of a message. + * @homa: Overall data about the Homa protocol implementation. + * @peer: The destination of the message. + * @length: Number of bytes in the message. + * + * Return: A priority level. + */ +int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, + int length) +{ + int i; + + for (i = homa->num_priorities - 1; ; i--) { + if (peer->unsched_cutoffs[i] >= length) + return i; + } + /* Can't ever get here */ +} + +/** + * homa_peer_set_cutoffs() - Set the cutoffs for unscheduled priorities in + * a peer object. This is a convenience function used primarily by unit tests. + * @peer: Homa_peer object whose cutoffs should be set. + * @c0: Largest message size that will use priority 0. + * @c1: Largest message size that will use priority 1. + * @c2: Largest message size that will use priority 2. + * @c3: Largest message size that will use priority 3. + * @c4: Largest message size that will use priority 4. + * @c5: Largest message size that will use priority 5. + * @c6: Largest message size that will use priority 6. + * @c7: Largest message size that will use priority 7. + */ +void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, + int c3, int c4, int c5, int c6, int c7) +{ + peer->unsched_cutoffs[0] = c0; + peer->unsched_cutoffs[1] = c1; + peer->unsched_cutoffs[2] = c2; + peer->unsched_cutoffs[3] = c3; + peer->unsched_cutoffs[4] = c4; + peer->unsched_cutoffs[5] = c5; + peer->unsched_cutoffs[6] = c6; + peer->unsched_cutoffs[7] = c7; +} + +/** + * homa_peer_lock_slow() - This function implements the slow path for + * acquiring a peer's @lock. It is invoked when the lock isn't + * immediately available. It waits for the lock, but also records statistics + * about the waiting time. + * @peer: Peer to lock. + */ +void homa_peer_lock_slow(struct homa_peer *peer) + __acquires(peer->lock) +{ + u64 start = homa_clock(); + + tt_record("beginning wait for peer lock"); + spin_lock_bh(&peer->lock); + tt_record("ending wait for peer lock"); + INC_METRIC(peer_ack_lock_misses, 1); + INC_METRIC(peer_ack_lock_miss_cycles, homa_clock() - start); +} +#endif /* See strip.py */ + +/** + * homa_peer_add_ack() - Add a given RPC to the list of unacked + * RPCs for its server. Once this method has been invoked, it's safe + * to delete the RPC, since it will eventually be acked to the server. + * @rpc: Client RPC that has now completed. + */ +void homa_peer_add_ack(struct homa_rpc *rpc) +{ + struct homa_peer *peer = rpc->peer; + struct homa_ack_hdr ack; + + homa_peer_lock(peer); + if (peer->num_acks < HOMA_MAX_ACKS_PER_PKT) { + peer->acks[peer->num_acks].client_id = cpu_to_be64(rpc->id); + peer->acks[peer->num_acks].server_port = htons(rpc->dport); + peer->num_acks++; + homa_peer_unlock(peer); + return; + } + + /* The peer has filled up; send an ACK message to empty it. The + * RPC in the message header will also be considered ACKed. + */ + INC_METRIC(ack_overflows, 1); + memcpy(ack.acks, peer->acks, sizeof(peer->acks)); + ack.num_acks = htons(peer->num_acks); + peer->num_acks = 0; + homa_peer_unlock(peer); + homa_xmit_control(ACK, &ack, sizeof(ack), rpc); +} + +/** + * homa_peer_get_acks() - Copy acks out of a peer, and remove them from the + * peer. + * @peer: Peer to check for possible unacked RPCs. 
+ * @count: Maximum number of acks to return.
+ * @dst: The acks are copied to this location.
+ *
+ * Return: The number of acks extracted from the peer (<= count).
+ */
+int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst)
+{
+	/* Don't waste time acquiring the lock if there are no ids available. */
+	if (peer->num_acks == 0)
+		return 0;
+
+	homa_peer_lock(peer);
+
+	if (count > peer->num_acks)
+		count = peer->num_acks;
+	memcpy(dst, &peer->acks[peer->num_acks - count],
+	       count * sizeof(peer->acks[0]));
+	peer->num_acks -= count;
+
+	homa_peer_unlock(peer);
+	return count;
+}
+
+/**
+ * homa_peer_update_sysctl_deps() - Update any peertab fields that depend
+ * on values set by sysctl. This function is invoked anytime a peer sysctl
+ * value is updated.
+ * @peertab: Struct to update.
+ */
+void homa_peer_update_sysctl_deps(struct homa_peertab *peertab)
+{
+	peertab->idle_jiffies_min = peertab->idle_secs_min * HZ;
+	peertab->idle_jiffies_max = peertab->idle_secs_max * HZ;
+}
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_peer_dointvec() - This function is a wrapper around proc_dointvec. It
+ * is invoked to read and write peer-related sysctl values.
+ * @table: sysctl table describing value to be read or written.
+ * @write: Nonzero means value is being written, 0 means read.
+ * @buffer: Address in user space of the input/output data.
+ * @lenp: Pointer to the number of bytes of data in @buffer; updated to
+ * reflect the number of bytes actually read or written.
+ * @ppos: Pointer to the current offset in the sysctl "file"; updated to
+ * reflect the data transferred.
+ *
+ * Return: 0 for success, nonzero for error.
+ */
+int homa_peer_dointvec(const struct ctl_table *table, int write,
+		       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct homa_peertab *peertab;
+	struct ctl_table table_copy;
+	int result;
+
+	peertab = homa_net(current->nsproxy->net_ns)->homa->peertab;
+
+	/* Generate a new ctl_table that refers to a field in the
+	 * net-specific struct homa.
+	 */
+	table_copy = *table;
+	table_copy.data = ((char *)peertab) + (uintptr_t)table_copy.data;
+
+	result = proc_dointvec(&table_copy, write, buffer, lenp, ppos);
+	homa_peer_update_sysctl_deps(peertab);
+	return result;
+}
+#endif /* See strip.py */
diff --git a/homa_peer.h b/homa_peer.h
new file mode 100644
index 00000000..65d6701a
--- /dev/null
+++ b/homa_peer.h
@@ -0,0 +1,387 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file contains definitions related to managing peers (homa_peer
+ * and homa_peertab).
+ */
+
+#ifndef _HOMA_PEER_H
+#define _HOMA_PEER_H
+
+#include "homa_wire.h"
+#include "homa_sock.h"
+
+#include <linux/rhashtable.h>
+
+struct homa_rpc;
+
+/**
+ * struct homa_peertab - Stores homa_peer objects, indexed by IPv6
+ * address.
+ */
+struct homa_peertab {
+	/**
+	 * @lock: Used to synchronize updates to @ht as well as other
+	 * operations on this object.
+	 */
+	spinlock_t lock;
+
+	/** @ht: Hash table that stores all homa_peer structs. */
+	struct rhashtable ht;
+
+	/** @ht_iter: Used to scan ht to find peers to garbage collect. */
+	struct rhashtable_iter ht_iter;
+
+	/** @num_peers: Total number of peers currently in @ht. */
+	int num_peers;
+
+	/**
+	 * @ht_valid: True means ht and ht_iter have been initialized and must
+	 * eventually be destroyed.
+	 */
+	bool ht_valid;
+
+	/** @rcu_head: Holds state of a pending call_rcu invocation. */
+	struct rcu_head rcu_head;
+
+	/**
+	 * @gc_stop_count: Nonzero means that peer garbage collection
+	 * should not be performed (conflicting state changes are underway).
+	 */
+	int gc_stop_count;
+
+	/**
+	 * @gc_threshold: If @num_peers is less than this, don't bother
+	 * doing any peer garbage collection.
Set externally via sysctl.
+	 */
+	int gc_threshold;
+
+	/**
+	 * @net_max: If the number of peers for a homa_net exceeds this number,
+	 * work aggressively to reclaim peers for that homa_net. Set
+	 * externally via sysctl.
+	 */
+	int net_max;
+
+	/**
+	 * @idle_secs_min: A peer will not be considered for garbage collection
+	 * under any circumstances if it has been idle less than this many
+	 * seconds. Set externally via sysctl.
+	 */
+	int idle_secs_min;
+
+	/**
+	 * @idle_jiffies_min: Same as idle_secs_min except in units
+	 * of jiffies.
+	 */
+	unsigned long idle_jiffies_min;
+
+	/**
+	 * @idle_secs_max: A peer that has been idle for less than
+	 * this many seconds will not be considered for garbage collection
+	 * unless its homa_net has more than @net_max peers. Set
+	 * externally via sysctl.
+	 */
+	int idle_secs_max;
+
+	/**
+	 * @idle_jiffies_max: Same as idle_secs_max except in units
+	 * of jiffies.
+	 */
+	unsigned long idle_jiffies_max;
+
+#ifndef __STRIP__ /* See strip.py */
+	/**
+	 * @sysctl_header: Used to remove sysctl values when this structure
+	 * is destroyed.
+	 */
+	struct ctl_table_header *sysctl_header;
+#endif /* See strip.py */
+};
+
+/**
+ * struct homa_peer_key - Used to look up homa_peer structs in an rhashtable.
+ */
+struct homa_peer_key {
+	/**
+	 * @addr: Address of the desired host. IPv4 addresses are represented
+	 * with IPv4-mapped IPv6 addresses. Must be the first variable in
+	 * the struct, because of the union in homa_peer.
+	 */
+	struct in6_addr addr;
+
+	/** @hnet: The network namespace in which this peer is valid. */
+	struct homa_net *hnet;
+};
+
+/**
+ * struct homa_peer - One of these objects exists for each machine that we
+ * have communicated with (either as client or server).
+ */
+struct homa_peer {
+	union {
+		/**
+		 * @addr: IPv6 address for the machine (IPv4 addresses are
+		 * stored as IPv4-mapped IPv6 addresses).
+		 */
+		struct in6_addr addr;
+
+		/** @ht_key: The hash table key for this peer in peertab->ht. */
+		struct homa_peer_key ht_key;
+	};
+
+	/**
+	 * @refs: Number of outstanding references to this peer. Includes
+	 * one reference for the entry in peertab->ht, plus one for each
+	 * unmatched call to homa_peer_hold; the peer gets freed when
+	 * this value becomes zero.
+	 */
+	refcount_t refs;
+
+	/**
+	 * @access_jiffies: Time in jiffies of the most recent access to this
+	 * peer.
+	 */
+	unsigned long access_jiffies;
+
+	/**
+	 * @ht_linkage: Used by the rhashtable implementation to link this
+	 * peer into peertab->ht.
+	 */
+	struct rhash_head ht_linkage;
+
+	/**
+	 * @lock: Used to synchronize access to fields in this struct, such
+	 * as @num_acks, @acks, @dst, and @dst_cookie.
+	 */
+	spinlock_t lock ____cacheline_aligned_in_smp;
+
+	/**
+	 * @num_acks: The number of (initial) entries in @acks that
+	 * currently hold valid information.
+	 */
+	int num_acks;
+
+	/**
+	 * @acks: Info about client RPCs whose results have been completely
+	 * received.
+	 */
+	struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT];
+
+	/**
+	 * @dst: Used to route packets to this peer; this object owns a
+	 * reference that must eventually be released.
+	 */
+	struct dst_entry __rcu *dst;
+
+	/**
+	 * @dst_cookie: Used to check whether @dst is still valid. This is
+	 * accessed without synchronization, which is racy, but the worst
+	 * that can happen is using an obsolete dst.
+	 */
+	u32 dst_cookie;
+
+	/**
+	 * @flow: Addressing info used to create @dst and also required
+	 * when transmitting packets.
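+	 *
+	 * Rewritten under @lock by homa_peer_reset_dst() whenever the
+	 * route to this peer is recreated.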
+ */ + struct flowi flow; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @unsched_cutoffs: priorities to use for unscheduled packets + * sent to this host, as specified in the most recent CUTOFFS + * packet from that host. See documentation for @homa.unsched_cutoffs + * for the meanings of these values. + */ + int unsched_cutoffs[HOMA_MAX_PRIORITIES]; + + /** + * @cutoff_version: value of cutoff_version in the most recent + * CUTOFFS packet received from this peer. 0 means we haven't + * yet received a CUTOFFS packet from the host. Note that this is + * stored in network byte order. + */ + __be16 cutoff_version; + + /** + * @last_update_jiffies: time in jiffies when we sent the most + * recent CUTOFFS packet to this peer. + */ + unsigned long last_update_jiffies; + + /** + * @active_rpcs: Number of RPCs involving this peer whose incoming + * messages are currently in homa->grant->active_rpcs. Managed by + * homa_grant.c under the grant lock. + */ + int active_rpcs; + + /** + * @grantable_rpcs: Contains homa_rpcs (both requests and responses) + * involving this peer that are not in homa->active_rpcs but + * whose msgins eventually need more grants. The list is sorted in + * priority order (head has fewest ungranted bytes). Managed by + * homa_grant.c under the grant lock. If this list is nonempty + * then refs will be nonzero. + */ + struct list_head grantable_rpcs; + + /** + * @grantable_links: Used to link this peer into homa->grantable_peers. + * If this RPC is not linked into homa->grantable_peers, this is an + * empty list pointing to itself. Managed by homa_grant.c under the + * grant lock. If this list is nonempty then refs will be nonzero. + */ + struct list_head grantable_links; +#endif /* See strip.py */ + + /** + * @outstanding_resends: the number of resend requests we have + * sent to this server (spaced @homa.resend_interval apart) since + * we received a packet from this peer. + */ + int outstanding_resends; + + /** + * @most_recent_resend: @homa->timer_ticks when the most recent + * resend was sent to this peer. + */ + int most_recent_resend; + + /** + * @least_recent_rpc: of all the RPCs for this peer scanned at + * @current_ticks, this is the RPC whose @resend_timer_ticks + * is farthest in the past. + */ + struct homa_rpc *least_recent_rpc; + + /** + * @least_recent_ticks: the @resend_timer_ticks value for + * @least_recent_rpc. + */ + u32 least_recent_ticks; + + /** + * @current_ticks: the value of @homa->timer_ticks the last time + * that @least_recent_rpc and @least_recent_ticks were computed. + * Used to detect the start of a new homa_timer pass. + */ + u32 current_ticks; + + /** + * @resend_rpc: the value of @least_recent_rpc computed in the + * previous homa_timer pass. This RPC will be issued a RESEND + * in the current pass, if it still needs one. + */ + struct homa_rpc *resend_rpc; + + /** @rcu_head: Holds state of a pending call_rcu invocation. 
	 */
+	struct rcu_head rcu_head;
+};
+
+void homa_dst_refresh(struct homa_peertab *peertab,
+		      struct homa_peer *peer, struct homa_sock *hsk);
+struct dst_entry
+	*homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk);
+void homa_peer_add_ack(struct homa_rpc *rpc);
+struct homa_peer
+	*homa_peer_alloc(struct homa_sock *hsk, const struct in6_addr *addr);
+struct homa_peertab
+	*homa_peer_alloc_peertab(void);
+int homa_peer_dointvec(const struct ctl_table *table, int write,
+		       void *buffer, size_t *lenp, loff_t *ppos);
+void homa_peer_free(struct rcu_head *head);
+void homa_peer_free_net(struct homa_net *hnet);
+void homa_peer_free_peertab(struct homa_peertab *peertab);
+void homa_peer_gc(struct homa_peertab *peertab);
+struct homa_peer
+	*homa_peer_get(struct homa_sock *hsk, const struct in6_addr *addr);
+int homa_peer_get_acks(struct homa_peer *peer, int count,
+		       struct homa_ack *dst);
+int homa_peer_pick_victims(struct homa_peertab *peertab,
+			   struct homa_peer *victims[], int max_victims);
+int homa_peer_prefer_evict(struct homa_peertab *peertab,
+			   struct homa_peer *peer1,
+			   struct homa_peer *peer2);
+void homa_peer_release_fn(void *object, void *dummy);
+int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk);
+void homa_peer_update_sysctl_deps(struct homa_peertab *peertab);
+#ifndef __STRIP__ /* See strip.py */
+void homa_peer_lock_slow(struct homa_peer *peer);
+void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1,
+			   int c2, int c3, int c4, int c5, int c6, int c7);
+#endif /* See strip.py */
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_peer_lock() - Acquire the lock for a peer. If the lock isn't
+ * immediately available, record stats on the waiting time.
+ * @peer: Peer to lock.
+ */
+static inline void homa_peer_lock(struct homa_peer *peer)
+	__acquires(peer->lock)
+{
+	if (!spin_trylock_bh(&peer->lock))
+		homa_peer_lock_slow(peer);
+}
+#else /* See strip.py */
+/**
+ * homa_peer_lock() - Acquire the lock for a peer.
+ * @peer: Peer to lock.
+ */
+static inline void homa_peer_lock(struct homa_peer *peer)
+	__acquires(peer->lock)
+{
+	spin_lock_bh(&peer->lock);
+}
+#endif /* See strip.py */
+
+/**
+ * homa_peer_unlock() - Release the lock for a peer.
+ * @peer: Peer to unlock.
+ */
+static inline void homa_peer_unlock(struct homa_peer *peer)
+	__releases(peer->lock)
+{
+	spin_unlock_bh(&peer->lock);
+}
+
+/**
+ * homa_peer_hold() - Increment the reference count on a peer, which will
+ * prevent it from being freed until homa_peer_release() is called.
+ * @peer: Object on which to take a reference.
+ */
+static inline void homa_peer_hold(struct homa_peer *peer)
+{
+	refcount_inc(&peer->refs);
+}
+
+/**
+ * homa_peer_release() - Release a reference on a peer (cancels the effect of
+ * a previous call to homa_peer_hold). If the reference count becomes zero
+ * then the peer may be deleted at any time.
+ * @peer: Object to release.
+ */
+static inline void homa_peer_release(struct homa_peer *peer)
+{
+	if (refcount_dec_and_test(&peer->refs))
+		call_rcu(&peer->rcu_head, homa_peer_free);
+}
+
+/**
+ * homa_peer_compare() - Comparison function for entries in @peertab->ht.
+ * @arg: Contains one of the keys to compare.
+ * @obj: homa_peer object containing the other key to compare.
+ * Return: 0 means the keys match, 1 means mismatch.
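+ *
+ * This function is registered as @obj_cmpfn in ht_params (see
+ * homa_peer.c), so the rhashtable code invokes it automatically
+ * during lookups and inserts.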
+ */ +static inline int homa_peer_compare(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct homa_peer_key *key = arg->key; + const struct homa_peer *peer = obj; + + return !(ipv6_addr_equal(&key->addr, &peer->ht_key.addr) && + peer->ht_key.hnet == key->hnet); +} + +#endif /* _HOMA_PEER_H */ diff --git a/homa_peertab.c b/homa_peertab.c deleted file mode 100644 index 2c2e0b9d..00000000 --- a/homa_peertab.c +++ /dev/null @@ -1,388 +0,0 @@ -/* Copyright (c) 2019-2022, Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* This file manages homa_peertab objects and is responsible for creating - * and deleting homa_peer objects. - */ - -#include "homa_impl.h" - -/** - * homa_peertab_init() - Constructor for homa_peertabs. - * @peertab: The object to initialize; previous contents are discarded. - * - * Return: 0 in the normal case, or a negative errno if there was a problem. - */ -int homa_peertab_init(struct homa_peertab *peertab) -{ - /* Note: when we return, the object must be initialized so it's - * safe to call homa_peertab_destroy, even if this function returns - * an error. - */ - int i; - spin_lock_init(&peertab->write_lock); - INIT_LIST_HEAD(&peertab->dead_dsts); - peertab->buckets = (struct hlist_head *) vmalloc( - HOMA_PEERTAB_BUCKETS * sizeof(*peertab->buckets)); - if (!peertab->buckets) - return -ENOMEM; - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - INIT_HLIST_HEAD(&peertab->buckets[i]); - } - return 0; -} - -/** - * homa_peertab_destroy() - Destructor for homa_peertabs. After this - * function returns, it is unsafe to use any results from previous calls - * to homa_peer_find, since all existing homa_peer objects will have been - * destroyed. - * @peertab: The table to destroy. - */ -void homa_peertab_destroy(struct homa_peertab *peertab) -{ - int i; - struct homa_peer *peer; - struct hlist_node *next; - if (!peertab->buckets) - return; - - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], - peertab_links) { - dst_release(peer->dst); - kfree(peer); - } - } - vfree(peertab->buckets); - homa_peertab_gc_dsts(peertab, ~0); -} - -/** - * homa_peertab_gc_dsts() - Invoked to free unused dst_entries, if it is - * safe to do so. - * @peertab: The table in which to free entries. - * @now: Current time, in get_cycles units; entries with expiration - * dates no later than this will be freed. Specify ~0 to - * free all entries. 
- */ -void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now) -{ - while (!list_empty(&peertab->dead_dsts)) { - struct homa_dead_dst *dead = list_first_entry( - &peertab->dead_dsts, struct homa_dead_dst, - dst_links); - if (dead->gc_time > now) - break; - dst_release(dead->dst); - list_del(&dead->dst_links); - kfree(dead); - } -} - -/** - * homa_peer_find() - Returns the peer associated with a given host; creates - * a new homa_peer if one doesn't already exist. - * @peertab: Peer table in which to perform lookup. - * @addr: Address of the desired host: IPv4 addresses are represented - * as IPv4-mapped IPv6 addresses. - * @inet: Socket that will be used for sending packets. - * - * Return: The peer associated with @addr, or a negative errno if an - * error occurred. The caller can retain this pointer - * indefinitely: peer entries are never deleted except in - * homa_peertab_destroy. - */ -struct homa_peer *homa_peer_find(struct homa_peertab *peertab, - const struct in6_addr *addr, struct inet_sock *inet) -{ - /* Note: this function uses RCU operators to ensure safety even - * if a concurrent call is adding a new entry. - */ - struct homa_peer *peer; - struct dst_entry *dst; - // Should use siphash or jhash here: - __u32 bucket = hash_32(addr->in6_u.u6_addr32[0], HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32(addr->in6_u.u6_addr32[1], HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32(addr->in6_u.u6_addr32[2], HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32(addr->in6_u.u6_addr32[3], HOMA_PEERTAB_BUCKET_BITS); - hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], - peertab_links) { - if (ipv6_addr_equal(&peer->addr, addr)) { - return peer; - } - INC_METRIC(peer_hash_links, 1); - } - - /* No existing entry; create a new one. - * - * Note: after we acquire the lock, we have to check again to - * make sure the entry still doesn't exist (it might have been - * created by a concurrent invocation of this function). - */ - spin_lock_bh(&peertab->write_lock); - hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], - peertab_links) { - if (ipv6_addr_equal(&peer->addr, addr)) - goto done; - } - peer = kmalloc(sizeof(*peer), GFP_ATOMIC); - if (!peer) { - peer = (struct homa_peer *) ERR_PTR(-ENOMEM); - INC_METRIC(peer_kmalloc_errors, 1); - goto done; - } - peer->addr = *addr; - dst = homa_peer_get_dst(peer, inet); - if (IS_ERR(dst)) { - kfree(peer); - peer = (struct homa_peer *) PTR_ERR(dst); - INC_METRIC(peer_route_errors, 1); - goto done; - } - peer->dst = dst; - peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-1] = 0; - peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2] = INT_MAX; - peer->cutoff_version = 0; - peer->last_update_jiffies = 0; - INIT_LIST_HEAD(&peer->grantable_rpcs); - INIT_LIST_HEAD(&peer->grantable_links); - hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]); - peer->outstanding_resends = 0; - peer->most_recent_resend = 0; - peer->least_recent_rpc = NULL; - peer->least_recent_ticks = 0; - peer->current_ticks = -1; - peer->resend_rpc = NULL; - peer->num_acks = 0; - spin_lock_init(&peer->ack_lock); - INC_METRIC(peer_new_entries, 1); - - done: - spin_unlock_bh(&peertab->write_lock); - return peer; -} - -/** - * homa_dst_refresh() - This method is called when the dst for a peer is - * obsolete; it releases that dst and creates a new one. - * @peertab: Table containing the peer. - * @peer: Peer whose dst is obsolete. - * @hsk: Socket that will be used to transmit data to the peer. 
- */ -void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, - struct homa_sock *hsk) -{ - struct dst_entry *dst; - - spin_lock_bh(&peertab->write_lock); - dst = homa_peer_get_dst(peer, &hsk->inet); - if (IS_ERR(dst)) { - /* Retain the existing dst if we can't create a new one. */ - if (hsk->homa->verbose) - printk(KERN_NOTICE "homa_refresh couldn't recreate " - "dst: error %ld", PTR_ERR(dst)); - INC_METRIC(peer_route_errors, 1); - } else { - struct homa_dead_dst *dead = (struct homa_dead_dst *) - kmalloc(sizeof(*dead), GFP_KERNEL); - if (unlikely(!dead)) { - /* Can't allocate memory to keep track of the - * dead dst; just free it immediately (a bit - * risky, admittedly). - */ - dst_release(peer->dst); - } else { - __u64 now = get_cycles(); - - dead->dst = peer->dst; - dead->gc_time = now + (cpu_khz<<7); - list_add_tail(&dead->dst_links, &peertab->dead_dsts); - homa_peertab_gc_dsts(peertab, now); - } - peer->dst = dst; - } - spin_unlock_bh(&peertab->write_lock); -} - -/** - * homa_peer_unsched_priority() - Returns the priority level to use for - * unscheduled packets of a message. - * @homa: Overall data about the Homa protocol implementation. - * @peer: The destination of the message. - * @length: Number of bytes in the message. - * - * Return: A priority level. - */ -int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, - int length) -{ - int i; - for (i = homa->num_priorities-1; ; i--) { - if (peer->unsched_cutoffs[i] >= length) - return i; - } - /* Can't ever get here */ -} - -/** - * homa_peer_get_dst() - Find an appropriate dst structure (either IPv4 - * or IPv6) for a peer. - * @peer: The peer for which a dst is needed. Note: this peer's flow - * struct will be overwritten. - * @inet: Socket that will be used for sending packets. - * Return: The dst structure (or an ERR_PTR). - */ -struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, - struct inet_sock *inet) -{ - memset(&peer->flow, 0, sizeof(peer->flow)); - if (inet->sk.sk_family == AF_INET) { - struct rtable *rt; - flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if, - inet->sk.sk_mark, inet->tos, RT_SCOPE_UNIVERSE, - inet->sk.sk_protocol, 0, - peer->addr.in6_u.u6_addr32[3], inet->inet_saddr, - 0, 0, inet->sk.sk_uid); - security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); - rt = ip_route_output_flow(sock_net(&inet->sk), - &peer->flow.u.ip4, &inet->sk); - if (IS_ERR(rt)) - return (struct dst_entry *)(PTR_ERR(rt)); - return &rt->dst; - } else { - peer->flow.u.ip6.flowi6_oif = inet->sk.sk_bound_dev_if; - peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX; - peer->flow.u.ip6.flowi6_mark = inet->sk.sk_mark; - peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE; - peer->flow.u.ip6.flowi6_proto = inet->sk.sk_protocol; - peer->flow.u.ip6.flowi6_flags = 0; - peer->flow.u.ip6.flowi6_secid = 0; - peer->flow.u.ip6.flowi6_tun_key.tun_id = 0; - peer->flow.u.ip6.flowi6_uid = inet->sk.sk_uid; - peer->flow.u.ip6.daddr = peer->addr; - peer->flow.u.ip6.saddr = inet->pinet6->saddr; - peer->flow.u.ip6.fl6_dport = 0; - peer->flow.u.ip6.fl6_sport = 0; - peer->flow.u.ip6.mp_hash = 0; - peer->flow.u.ip6.__fl_common.flowic_tos = inet->tos; - peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(inet->tos, 0); - security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); - return ip6_dst_lookup_flow(sock_net(&inet->sk), &inet->sk, - &peer->flow.u.ip6, NULL); - } -} - -/** - * homa_peer_set_cutoffs() - Set the cutoffs for unscheduled priorities in - * a peer object. 
This is a convenience function used primarily by unit tests. - * @peer: Homa_peer object whose cutoffs should be set. - * @c0: Largest message size that will use priority 0. - * @c1: Largest message size that will use priority 1. - * @c2: Largest message size that will use priority 2. - * @c3: Largest message size that will use priority 3. - * @c4: Largest message size that will use priority 4. - * @c5: Largest message size that will use priority 5. - * @c6: Largest message size that will use priority 6. - * @c7: Largest message size that will use priority 7. - */ -void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, - int c3, int c4, int c5, int c6, int c7) -{ - peer->unsched_cutoffs[0] = c0; - peer->unsched_cutoffs[1] = c1; - peer->unsched_cutoffs[2] = c2; - peer->unsched_cutoffs[3] = c3; - peer->unsched_cutoffs[4] = c4; - peer->unsched_cutoffs[5] = c5; - peer->unsched_cutoffs[6] = c6; - peer->unsched_cutoffs[7] = c7; -} - -/** - * homa_peer_lock_slow() - This function implements the slow path for - * acquiring a peer's @unacked_lock. It is invoked when the lock isn't - * immediately available. It waits for the lock, but also records statistics - * about the waiting time. - * @peer: Peer to lock. - */ -void homa_peer_lock_slow(struct homa_peer *peer) -{ - __u64 start = get_cycles(); - tt_record("beginning wait for peer lock"); - spin_lock_bh(&peer->ack_lock); - tt_record("ending wait for peer lock"); - INC_METRIC(peer_ack_lock_misses, 1); - INC_METRIC(peer_ack_lock_miss_cycles, get_cycles() - start); -} - -/** - * homa_peer_add_ack() - Add a given RPC to the list of unacked - * RPCs for its server. Once this method has been invoked, it's safe - * to delete the RPC, since it will eventually be acked to the server. - * @rpc: Client RPC that has now completed. - */ -void homa_peer_add_ack(struct homa_rpc *rpc) -{ - struct homa_peer *peer = rpc->peer; - struct ack_header ack; - - homa_peer_lock(peer); - if (peer->num_acks < NUM_PEER_UNACKED_IDS) { - peer->acks[peer->num_acks].client_id = cpu_to_be64(rpc->id); - peer->acks[peer->num_acks].client_port = htons(rpc->hsk->port); - peer->acks[peer->num_acks].server_port = htons(rpc->dport); - peer->num_acks++; - homa_peer_unlock(peer); - return; - } - - /* The peer has filled up; send an ACK message to empty it. The - * RPC in the message header will also be considered ACKed. - */ - INC_METRIC(ack_overflows, 1); - memcpy(ack.acks, peer->acks, sizeof(peer->acks)); - ack.num_acks = htons(peer->num_acks); - peer->num_acks = 0; - homa_peer_unlock(peer); - homa_xmit_control(ACK, &ack, sizeof(ack), rpc); -} - -/** - * homa_peer_get_acks() - Copy acks out of a peer, and remove them from the - * peer. - * @peer: Peer to check for possible unacked RPCs. - * @count: Maximum number of acks to return. - * @dst: The acks are copied to this location. - * - * Return: The number of acks extracted from the peer (<= count). - */ -int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst) -{ - /* Don't waste time acquiring the lock if there are no ids available. 
*/ - if (peer->num_acks == 0) - return 0; - - homa_peer_lock(peer); - - if (count > peer->num_acks) - count = peer->num_acks; - memcpy(dst, &peer->acks[peer->num_acks - count], - count * sizeof(peer->acks[0])); - peer->num_acks -= count; - - homa_peer_unlock(peer); - return count; -} diff --git a/homa_plumbing.c b/homa_plumbing.c index 8f3f668c..e8a4a020 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1,57 +1,38 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file consists mostly of "glue" that hooks Homa into the rest of * the Linux kernel. The guts of the protocol are in other files. */ #include "homa_impl.h" -#include "homa_lcache.h" - -#ifndef __UNIT_TEST__ -MODULE_LICENSE("Dual MIT/GPL"); -#endif -MODULE_AUTHOR("John Ousterhout"); -MODULE_DESCRIPTION("Homa transport protocol"); -MODULE_VERSION("0.01"); +#include "homa_peer.h" +#include "homa_pool.h" -/* Not yet sure what these variables are for */ -long sysctl_homa_mem[3] __read_mostly; -int sysctl_homa_rmem_min __read_mostly; -int sysctl_homa_wmem_min __read_mostly; -atomic_long_t homa_memory_allocated; +#ifndef __STRIP__ /* See strip.py */ +#include "homa_grant.h" +#include "homa_offload.h" +#include "homa_pacer.h" +#include "homa_qdisc.h" +#endif /* See strip.py */ -/* Global data for Homa. Never reference homa_data directory. Always use - * the homa variable instead; this allows overriding during unit tests. - */ -struct homa homa_data; -struct homa *homa = &homa_data; +/* Identifier for retrieving Homa-specific data for a struct net. */ +unsigned int homa_net_id; -/* True means that the Homa module is in the process of unloading itself, - * so everyone should clean up. +/* This structure defines functions that allow Homa to be used as a + * pernet subsystem. */ -static bool exiting = false; - -/* Thread that runs timer code to detect lost packets and crashed peers. */ -static struct task_struct *timer_kthread; +static struct pernet_operations homa_net_ops = { + .init = homa_net_start, + .exit = homa_net_exit, + .id = &homa_net_id, + .size = sizeof(struct homa_net) +}; -/* Set via sysctl to request that information on a particular topic - * be printed to the system log. The value written determines the - * topic. +/* Global data for Homa. Avoid referencing directly except when there is + * no alternative (instead, use a homa pointer stored in a struct or + * passed via a parameter). This allows overriding during unit tests. */ -static int log_topic; +static struct homa homa_data; /* This structure defines functions that handle various operations on * Homa sockets. 
These functions are relatively generic: they are called @@ -59,7 +40,7 @@ static int log_topic; * be implemented by PF_INET6 functions that are independent of the * Homa protocol. */ -const struct proto_ops homa_proto_ops = { +static const struct proto_ops homa_proto_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -69,7 +50,7 @@ const struct proto_ops homa_proto_ops = { .accept = sock_no_accept, .getname = inet_getname, .poll = homa_poll, - .ioctl = inet_ioctl, + .ioctl = homa_ioctl, .listen = sock_no_listen, .shutdown = homa_shutdown, .setsockopt = sock_common_setsockopt, @@ -77,11 +58,10 @@ const struct proto_ops homa_proto_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, }; -const struct proto_ops homav6_proto_ops = { +static const struct proto_ops homav6_proto_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -91,7 +71,7 @@ const struct proto_ops homav6_proto_ops = { .accept = sock_no_accept, .getname = inet6_getname, .poll = homa_poll, - .ioctl = inet6_ioctl, + .ioctl = homa_ioctl, .listen = sock_no_listen, .shutdown = homa_shutdown, .setsockopt = sock_common_setsockopt, @@ -99,7 +79,6 @@ const struct proto_ops homav6_proto_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, }; @@ -109,70 +88,43 @@ const struct proto_ops homav6_proto_ops = { * protocol family, and in many cases they are invoked by functions in * homa_proto_ops. Most of these functions have Homa-specific implementations. */ -struct proto homa_prot = { +static struct proto homa_prot = { .name = "HOMA", .owner = THIS_MODULE, .close = homa_close, .connect = ip4_datagram_connect, - .disconnect = homa_disconnect, - .ioctl = homa_ioctl, .init = homa_socket, - .destroy = 0, + .destroy = homa_sock_destroy, .setsockopt = homa_setsockopt, .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, - .sendpage = homa_sendpage, - .backlog_rcv = homa_backlog_rcv, - .release_cb = ip4_datagram_release_cb, .hash = homa_hash, .unhash = homa_unhash, - .rehash = homa_rehash, - .get_port = homa_get_port, - .memory_allocated = &homa_memory_allocated, - .sysctl_mem = sysctl_homa_mem, - .sysctl_wmem = &sysctl_homa_wmem_min, - .sysctl_rmem = &sysctl_homa_rmem_min, .obj_size = sizeof(struct homa_sock), - .diag_destroy = homa_diag_destroy, .no_autobind = 1, }; -struct proto homav6_prot = { +static struct proto homav6_prot = { .name = "HOMAv6", .owner = THIS_MODULE, .close = homa_close, .connect = ip6_datagram_connect, - .disconnect = homa_disconnect, - .ioctl = homa_ioctl, .init = homa_socket, - .destroy = 0, + .destroy = homa_sock_destroy, .setsockopt = homa_setsockopt, .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, - .sendpage = homa_sendpage, - .backlog_rcv = homa_backlog_rcv, - .release_cb = ip6_datagram_release_cb, .hash = homa_hash, .unhash = homa_unhash, - .rehash = homa_rehash, - .get_port = homa_get_port, - .memory_allocated = &homa_memory_allocated, - .sysctl_mem = sysctl_homa_mem, - .sysctl_wmem = &sysctl_homa_wmem_min, - .sysctl_rmem = &sysctl_homa_rmem_min, - - /* IPv6 data comes *after* Homa's data, and isn't included in - * struct homa_sock. 
- */ - .obj_size = sizeof(struct homa_sock) + sizeof(struct ipv6_pinfo), - .diag_destroy = homa_diag_destroy, + .obj_size = sizeof(struct homa_v6_sock), + .ipv6_pinfo_offset = offsetof(struct homa_v6_sock, inet6), .no_autobind = 1, }; /* Top-level structure describing the Homa protocol. */ -struct inet_protosw homa_protosw = { +static struct inet_protosw homa_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_HOMA, .prot = &homa_prot, @@ -180,7 +132,7 @@ struct inet_protosw homa_protosw = { .flags = INET_PROTOSW_REUSE, }; -struct inet_protosw homav6_protosw = { +static struct inet_protosw homav6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_HOMA, .prot = &homav6_prot, @@ -198,460 +150,636 @@ static struct net_protocol homa_protocol = { static struct inet6_protocol homav6_protocol = { .handler = homa_softirq, .err_handler = homa_err_handler_v6, - .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, -}; - -/* Describes file operations implemented for /proc/net/homa_metrics. */ -static const struct proc_ops homa_metrics_pops = { - .proc_open = homa_metrics_open, - .proc_read = homa_metrics_read, - .proc_lseek = homa_metrics_lseek, - .proc_release = homa_metrics_release, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; -/* Used to remove /proc/net/homa_metrics when the module is unloaded. */ -static struct proc_dir_entry *metrics_dir_entry = NULL; - -/* Used to configure sysctl access to Homa configuration parameters.*/ +#ifndef __STRIP__ /* See strip.py */ +/* Used to configure sysctl access to Homa configuration parameters. The + * @data fields are actually offsets within a struct homa; these are converted + * to pointers into a net-specific struct homa later. + */ +#define OFFSET(field) ((void *)offsetof(struct homa, field)) static struct ctl_table homa_ctl_table[] = { { - .procname = "bpage_lease_usecs", - .data = &homa_data.bpage_lease_usecs, + .procname = "accept_bits", + .data = OFFSET(accept_bits), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "cutoff_version", - .data = &homa_data.cutoff_version, + .procname = "action", + .data = OFFSET(sysctl_action), .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec + .mode = 0644, + .proc_handler = homa_dointvec }, { - .procname = "dead_buffs_limit", - .data = &homa_data.dead_buffs_limit, + .procname = "bpage_lease_usecs", + .data = OFFSET(bpage_lease_usecs), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { - .procname = "duty_cycle", - .data = &homa_data.duty_cycle, + .procname = "busy_usecs", + .data = OFFSET(busy_usecs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "flags", - .data = &homa_data.flags, + .procname = "cutoff_version", + .data = OFFSET(cutoff_version), .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec + .mode = 0444, + .proc_handler = homa_dointvec }, { - .procname = "freeze_type", - .data = &homa_data.freeze_type, + .procname = "dead_buffs_limit", + .data = OFFSET(dead_buffs_limit), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { - .procname = "grant_fifo_fraction", - .data = &homa_data.grant_fifo_fraction, + .procname = "drop_bits", + .data = OFFSET(drop_bits), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "fifo_grant_increment", - .data = &homa_data.fifo_grant_increment, + .procname = "flags", + .data = OFFSET(flags), .maxlen = sizeof(int), .mode 
= 0644, .proc_handler = homa_dointvec }, { - .procname = "gro_busy_us", - .data = &homa_data.gro_busy_usecs, + .procname = "freeze_type", + .data = OFFSET(freeze_type), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "gro_policy", - .data = &homa_data.gro_policy, - .maxlen = sizeof(int), + .procname = "gen3_softirq_cores", + .data = NULL, + .maxlen = 0, .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_sysctl_softirq_cores }, { - .procname = "gso_force_software", - .data = &homa_data.gso_force_software, + .procname = "gro_busy_usecs", + .data = OFFSET(gro_busy_usecs), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { - .procname = "link_mbps", - .data = &homa_data.link_mbps, + .procname = "gro_policy", + .data = OFFSET(gro_policy), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "log_topic", - .data = &log_topic, + .procname = "gso_force_software", + .data = OFFSET(gso_force_software), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "pacer_fifo_fraction", - .data = &homa_data.pacer_fifo_fraction, + .procname = "hijack_tcp", + .data = OFFSET(hijack_tcp), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "max_dead_buffs", - .data = &homa_data.max_dead_buffs, + .procname = "link_mbps", + .data = OFFSET(link_mbps), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { - .procname = "max_grant_window", - .data = &homa_data.max_grant_window, + .procname = "max_dead_buffs", + .data = OFFSET(max_dead_buffs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_gro_skbs", - .data = &homa_data.max_gro_skbs, + .data = OFFSET(max_gro_skbs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_gso_size", - .data = &homa_data.max_gso_size, + .data = OFFSET(max_gso_size), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "max_nic_queue_ns", - .data = &homa_data.max_nic_queue_ns, + .procname = "max_sched_prio", + .data = OFFSET(max_sched_prio), .maxlen = sizeof(int), - .mode = 0644, + .mode = 0444, .proc_handler = homa_dointvec }, { - .procname = "max_overcommit", - .data = &homa_data.max_overcommit, + .procname = "next_id", + .data = OFFSET(next_id), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, - { - .procname = "max_sched_prio", - .data = &homa_data.max_sched_prio, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec - }, { .procname = "num_priorities", - .data = &homa_data.num_priorities, + .data = OFFSET(num_priorities), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "poll_usecs", - .data = &homa_data.poll_usecs, + .data = OFFSET(poll_usecs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "priority_map", - .data = &homa_data.priority_map, - .maxlen = HOMA_MAX_PRIORITIES*sizeof(int), + .data = OFFSET(priority_map), + .maxlen = HOMA_MAX_PRIORITIES * sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "reap_limit", - .data = &homa_data.reap_limit, + .data = OFFSET(reap_limit), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "request_ack_ticks", - .data = &homa_data.request_ack_ticks, + 
.data = OFFSET(request_ack_ticks), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "resend_interval", - .data = &homa_data.resend_interval, + .data = OFFSET(resend_interval), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "resend_ticks", - .data = &homa_data.resend_ticks, + .data = OFFSET(resend_ticks), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { - .procname = "rtt_bytes", - .data = &homa_data.rtt_bytes, + .procname = "skb_page_frees_per_sec", + .data = OFFSET(skb_page_frees_per_sec), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "sync_freeze", - .data = &homa_data.sync_freeze, + .procname = "skb_page_pool_min_kb", + .data = OFFSET(skb_page_pool_min_kb), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "temp", - .data = homa_data.temp, - .maxlen = sizeof(homa_data.temp), + .data = OFFSET(temp[0]), + .maxlen = sizeof(((struct homa *)0)->temp), .mode = 0644, .proc_handler = homa_dointvec }, { - .procname = "throttle_min_bytes", - .data = &homa_data.throttle_min_bytes, + .procname = "timeout_resends", + .data = OFFSET(timeout_resends), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { - .procname = "timeout_resends", - .data = &homa_data.timeout_resends, + .procname = "timeout_ticks", + .data = OFFSET(timeout_ticks), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec + }, + { + .procname = "unsched_bytes", + .data = OFFSET(unsched_bytes), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_dointvec }, { .procname = "unsched_cutoffs", - .data = &homa_data.unsched_cutoffs, - .maxlen = HOMA_MAX_PRIORITIES*sizeof(int), + .data = OFFSET(unsched_cutoffs), + .maxlen = HOMA_MAX_PRIORITIES * sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "verbose", - .data = &homa_data.verbose, + .data = OFFSET(verbose), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_dointvec + }, + { + .procname = "wmem_max", + .data = OFFSET(wmem_max), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, - {} }; +#endif /* See strip.py */ /* Sizes of the headers for each Homa packet type, in bytes. 
*/ -static __u16 header_lengths[] = { - sizeof32(struct data_header), - sizeof32(struct grant_header), - sizeof32(struct resend_header), - sizeof32(struct unknown_header), - sizeof32(struct busy_header), - sizeof32(struct cutoffs_header), - sizeof32(struct freeze_header), - sizeof32(struct need_ack_header), - sizeof32(struct ack_header) +#ifndef __STRIP__ /* See strip.py */ +static u16 header_lengths[] = { + sizeof(struct homa_data_hdr), + sizeof(struct homa_grant_hdr), + sizeof(struct homa_resend_hdr), + sizeof(struct homa_rpc_unknown_hdr), + sizeof(struct homa_busy_hdr), + sizeof(struct homa_cutoffs_hdr), + sizeof(struct homa_freeze_hdr), + sizeof(struct homa_need_ack_hdr), + sizeof(struct homa_ack_hdr) }; +#else /* See strip.py */ +static u16 header_lengths[] = { + sizeof(struct homa_data_hdr), + 0, + sizeof(struct homa_resend_hdr), + sizeof(struct homa_rpc_unknown_hdr), + sizeof(struct homa_busy_hdr), + 0, + 0, + sizeof(struct homa_need_ack_hdr), + sizeof(struct homa_ack_hdr) +}; +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ /* Used to remove sysctl values when the module is unloaded. */ static struct ctl_table_header *homa_ctl_header; +#endif /* See strip.py */ +/* Thread that runs timer code to detect lost packets and crashed peers. */ +static struct task_struct *timer_kthread; static DECLARE_COMPLETION(timer_thread_done); +/* Used to wakeup timer_kthread at regular intervals. */ +static struct hrtimer hrtimer; + +/* Nonzero is an indication to the timer thread that it should exit. */ +static int timer_thread_exit; + /** * homa_load() - invoked when this module is loaded into the Linux kernel * Return: 0 on success, otherwise a negative errno. */ -static int __init homa_load(void) { +int __init homa_load(void) +{ + struct homa *homa = &homa_data; + bool init_protocol6 = false; + bool init_protosw6 = false; + bool init_protocol = false; + bool init_protosw = false; + bool init_net_ops = false; + bool init_proto6 = false; + bool init_proto = false; + bool init_homa = false; int status; - printk(KERN_NOTICE "Homa module loading\n"); - printk(KERN_NOTICE "Homa structure sizes: data_header %u, " - "data_segment %u, ack %u, " - "grant_header %u, peer %u, ip_hdr %u flowi %u " - "ipv6_hdr %u, flowi6 %u " - "tcp_sock %u homa_rpc %u sk_buff %u " - "rcvmsg_control %u sockaddr_in_union %u " - "HOMA_MAX_BPAGES %u NR_CPUS %u " - "nr_cpu_ids %u\n", - sizeof32(struct data_header), - sizeof32(struct data_segment), - sizeof32(struct homa_ack), - sizeof32(struct grant_header), - sizeof32(struct homa_peer), - sizeof32(struct iphdr), - sizeof32(struct flowi), - sizeof32(struct ipv6hdr), - sizeof32(struct flowi6), - sizeof32(struct tcp_sock), - sizeof32(struct homa_rpc), - sizeof32(struct sk_buff), - sizeof32(struct homa_recvmsg_args), - sizeof32(sockaddr_in_union), - HOMA_MAX_BPAGES, - NR_CPUS, - nr_cpu_ids); + IF_NO_STRIP(bool init_metrics = false); + IF_NO_STRIP(bool init_offload = false); + IF_NO_STRIP(bool init_sysctl = false); + IF_NO_STRIP(bool init_qdisc = false); + + /* Compile-time validations that no packet header is longer + * than HOMA_MAX_HEADER. 
+ */ + BUILD_BUG_ON(sizeof(struct homa_data_hdr) > HOMA_MAX_HEADER); +#ifndef __STRIP__ /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_grant_hdr) > HOMA_MAX_HEADER); +#endif /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_resend_hdr) > HOMA_MAX_HEADER); + BUILD_BUG_ON(sizeof(struct homa_rpc_unknown_hdr) > HOMA_MAX_HEADER); + BUILD_BUG_ON(sizeof(struct homa_busy_hdr) > HOMA_MAX_HEADER); +#ifndef __STRIP__ /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_cutoffs_hdr) > HOMA_MAX_HEADER); +#endif /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_freeze_hdr) > HOMA_MAX_HEADER); +#endif /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_need_ack_hdr) > HOMA_MAX_HEADER); + BUILD_BUG_ON(sizeof(struct homa_ack_hdr) > HOMA_MAX_HEADER); + + /* Extra constraints on data packets: + * - Ensure minimum header length so Homa doesn't have to worry about + * padding data packets. + * - Make sure data packet headers are a multiple of 4 bytes (needed + * for TCP/TSO compatibility). + */ + BUILD_BUG_ON(sizeof(struct homa_data_hdr) < HOMA_MIN_PKT_LENGTH); + BUILD_BUG_ON((sizeof(struct homa_data_hdr) - + sizeof(struct homa_seg_hdr)) & 0x3); + +#ifndef __STRIP__ /* See strip.py */ + /* Homa requires at least 8 priority levels. */ + BUILD_BUG_ON(HOMA_MAX_PRIORITIES < 8); +#endif /* See strip.py */ + + /* Detect size changes in uAPI structs. */ + BUILD_BUG_ON(sizeof(struct homa_sendmsg_args) != 24); + BUILD_BUG_ON(sizeof(struct homa_recvmsg_args) != 88); +#ifndef __STRIP__ /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_abort_args) != 32); +#endif /* See strip.py */ + +#ifndef __UPSTREAM__ /* See strip.py */ + pr_err("Homa module loading\n"); + pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, homa_grant_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu skb_shared_info %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", + sizeof(struct homa_data_hdr), + sizeof(struct homa_seg_hdr), + sizeof(struct homa_grant_hdr), + sizeof(struct homa_ack), + sizeof(struct homa_peer), + sizeof(struct iphdr), + sizeof(struct flowi), + sizeof(struct ipv6hdr), + sizeof(struct flowi6), + sizeof(struct tcp_sock), + sizeof(struct homa_rpc), + sizeof(struct sk_buff), + sizeof(struct skb_shared_info), + sizeof(struct homa_recvmsg_args), + sizeof(union sockaddr_in_union), + HOMA_MAX_BPAGES, + NR_CPUS, + nr_cpu_ids, + MAX_NUMNODES); +#endif /* See strip.py */ + +#ifndef __UPSTREAM__ /* See strip.py */ + tt_init("timetrace"); +#endif /* See strip.py */ + + status = homa_init(homa); + if (status) + goto error; + init_homa = true; + status = proto_register(&homa_prot, 1); if (status != 0) { - printk(KERN_ERR "proto_register failed for homa_prot: %d\n", - status); - goto out; + pr_err("proto_register failed for homa_prot: %d\n", status); + goto error; } + init_proto = true; + status = proto_register(&homav6_prot, 1); if (status != 0) { - printk(KERN_ERR "proto_register failed for homav6_prot: %d\n", - status); - goto out; + pr_err("proto_register failed for homav6_prot: %d\n", status); + goto error; } + init_proto6 = true; + inet_register_protosw(&homa_protosw); - inet6_register_protosw(&homav6_protosw); + init_protosw = true; + + status = inet6_register_protosw(&homav6_protosw); + if (status != 0) { + pr_err("inet6_register_protosw failed in %s: %d\n", __func__, + status); + goto error; + } + init_protosw6 = true; + status = 
inet_add_protocol(&homa_protocol, IPPROTO_HOMA); if (status != 0) { - printk(KERN_ERR "inet_add_protocol failed in homa_load: %d\n", - status); - goto out_cleanup; + pr_err("inet_add_protocol failed in %s: %d\n", __func__, + status); + goto error; } + init_protocol = true; + status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA); if (status != 0) { - printk(KERN_ERR "inet6_add_protocol failed in homa_load: %d\n", - status); - goto out_cleanup; + pr_err("inet6_add_protocol failed in %s: %d\n", __func__, + status); + goto error; } + init_protocol6 = true; - status = homa_init(homa); - if (status) - goto out_cleanup; - metrics_dir_entry = proc_create("homa_metrics", S_IRUGO, - init_net.proc_net, &homa_metrics_pops); - if (!metrics_dir_entry) { - printk(KERN_ERR "couldn't create /proc/net/homa_metrics\n"); - status = -ENOMEM; - goto out_cleanup; - } +#ifndef __STRIP__ /* See strip.py */ + status = homa_metrics_init(); + if (status != 0) + goto error; + init_metrics = true; homa_ctl_header = register_net_sysctl(&init_net, "net/homa", - homa_ctl_table); + homa_ctl_table); if (!homa_ctl_header) { - printk(KERN_ERR "couldn't register Homa sysctl parameters\n"); + pr_err("couldn't register Homa sysctl parameters\n"); status = -ENOMEM; - goto out_cleanup; + goto error; } + init_sysctl = true; status = homa_offload_init(); if (status != 0) { - printk(KERN_ERR "Homa couldn't init offloads\n"); - goto out_cleanup; + pr_err("Homa couldn't init offloads\n"); + goto error; + } + init_offload = true; + + status = homa_qdisc_register(); + if (status != 0) { + pr_err("Homa couldn't load its qdisc: error %d\n", status); + goto error; } + init_qdisc = true; +#endif /* See strip.py */ + + status = register_pernet_subsys(&homa_net_ops); + if (status != 0) { + pr_err("Homa got error from register_pernet_subsys: %d\n", + status); + goto error; + } + init_net_ops = true; timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); if (IS_ERR(timer_kthread)) { status = PTR_ERR(timer_kthread); - printk(KERN_ERR "couldn't create homa pacer thread: error %d\n", - status); + pr_err("couldn't create Homa timer thread: error %d\n", + status); timer_kthread = NULL; - goto out_cleanup; + goto error; } - tt_init("timetrace", homa->temp); +#ifndef __STRIP__ /* See strip.py */ + homa_gro_hook_tcp(); +#endif /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ + tt_set_temp(homa->temp); +#endif /* See strip.py */ return 0; -out_cleanup: - homa_offload_end(); - unregister_net_sysctl_table(homa_ctl_header); - proc_remove(metrics_dir_entry); - homa_destroy(homa); - inet_del_protocol(&homa_protocol, IPPROTO_HOMA); - inet_unregister_protosw(&homa_protosw); - inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); - inet6_unregister_protosw(&homav6_protosw); - proto_unregister(&homa_prot); - proto_unregister(&homav6_prot); -out: +error: + if (timer_kthread) { + timer_thread_exit = 1; + wake_up_process(timer_kthread); + wait_for_completion(&timer_thread_done); + } +#ifndef __STRIP__ /* See strip.py */ + if (init_qdisc) + homa_qdisc_unregister(); + if (init_offload) + homa_offload_end(); + if (init_sysctl) + unregister_net_sysctl_table(homa_ctl_header); + if (init_metrics) + homa_metrics_end(); +#endif /* See strip.py */ + if (init_net_ops) + unregister_pernet_subsys(&homa_net_ops); + if (init_homa) + homa_destroy(homa); + if (init_protocol) + inet_del_protocol(&homa_protocol, IPPROTO_HOMA); + if (init_protocol6) + inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); + if (init_protosw) + 
inet_unregister_protosw(&homa_protosw); + if (init_protosw6) + inet6_unregister_protosw(&homav6_protosw); + if (init_proto) + proto_unregister(&homa_prot); + if (init_proto6) + proto_unregister(&homav6_prot); return status; } /** * homa_unload() - invoked when this module is unloaded from the Linux kernel. */ -static void __exit homa_unload(void) { - printk(KERN_NOTICE "Homa module unloading\n"); - exiting = true; +void __exit homa_unload(void) +{ + struct homa *homa = &homa_data; - tt_destroy(); + pr_notice("Homa module unloading\n"); - if (timer_kthread) +#ifndef __STRIP__ /* See strip.py */ + homa_gro_unhook_tcp(); + if (timer_kthread) { + timer_thread_exit = 1; wake_up_process(timer_kthread); + wait_for_completion(&timer_thread_done); + } + homa_qdisc_unregister(); if (homa_offload_end() != 0) - printk(KERN_ERR "Homa couldn't stop offloads\n"); - wait_for_completion(&timer_thread_done); + pr_err("Homa couldn't stop offloads\n"); unregister_net_sysctl_table(homa_ctl_header); - proc_remove(metrics_dir_entry); - homa_destroy(homa); + homa_metrics_end(); +#endif /* See strip.py */ + unregister_pernet_subsys(&homa_net_ops); inet_del_protocol(&homa_protocol, IPPROTO_HOMA); inet_unregister_protosw(&homa_protosw); inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); inet6_unregister_protosw(&homav6_protosw); proto_unregister(&homa_prot); proto_unregister(&homav6_prot); + homa_destroy(homa); +#ifndef __UPSTREAM__ /* See strip.py */ + tt_destroy(); +#endif /* See strip.py */ } module_init(homa_load); module_exit(homa_unload); +/** + * homa_net_start() - Initialize Homa for a new network namespace. + * @net: The net that Homa will be associated with. + * Return: 0 on success, otherwise a negative errno. + */ +int homa_net_start(struct net *net) +{ + pr_notice("Homa attaching to net namespace\n"); + return homa_net_init(homa_net(net), net, &homa_data); +} + +/** + * homa_net_exit() - Perform Homa cleanup needed when a network namespace + * is destroyed. + * @net: The net from which Homa should be removed. + */ +void homa_net_exit(struct net *net) +{ + pr_notice("Homa detaching from net namespace\n"); + homa_net_destroy(homa_net(net)); +} + /** * homa_bind() - Implements the bind system call for Homa sockets: associates * a well-known service port with a socket. Unlike other AF_INET6 protocols, * there is no need to invoke this system call for sockets that are only * used as clients. * @sock: Socket on which the system call was invoked. - * @addr: Contains the desired port number. + * @addr: Contains the desired port number. * @addr_len: Number of bytes in uaddr. - * Return: 0 on success, otherwise a negative errno. + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors. 
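+ *
+ * As an illustration only (a hypothetical user-space sketch, not part of
+ * this module; port 4000 is an arbitrary choice), a server would bind as:
+ *
+ *	int fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_HOMA);
+ *	struct sockaddr_in6 addr = {};
+ *
+ *	addr.sin6_family = AF_INET6;
+ *	addr.sin6_addr = in6addr_any;
+ *	addr.sin6_port = htons(4000);
+ *	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) != 0)
+ *		perror("bind");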
*/ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { + union sockaddr_in_union *addr_in = (union sockaddr_in_union *)addr; struct homa_sock *hsk = homa_sk(sock->sk); - sockaddr_in_union *addr_in = (sockaddr_in_union *) addr; - int port; + int port = 0; if (unlikely(addr->sa_family != sock->sk->sk_family)) { + hsk->error_msg = "address family in bind address didn't match socket"; return -EAFNOSUPPORT; } if (addr_in->in6.sin6_family == AF_INET6) { if (addr_len < sizeof(struct sockaddr_in6)) { + hsk->error_msg = "ipv6 address too short"; return -EINVAL; } - port = ntohs(addr_in->in4.sin_port); + port = ntohs(addr_in->in6.sin6_port); } else if (addr_in->in4.sin_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) { + hsk->error_msg = "ipv4 address too short"; return -EINVAL; } - port = ntohs(addr_in->in6.sin6_port); + port = ntohs(addr_in->in4.sin_port); } - return homa_sock_bind(&homa->port_map, hsk, port); + return homa_sock_bind(hsk->hnet, hsk, port); } /** @@ -659,18 +787,21 @@ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) * @sk: Socket being closed - * @timeout: ?? + * @timeout: Not used. */ -void homa_close(struct sock *sk, long timeout) { +void homa_close(struct sock *sk, long timeout) +{ struct homa_sock *hsk = homa_sk(sk); - homa_sock_destroy(hsk); +#ifndef __UPSTREAM__ /* See strip.py */ + int port = hsk->port; +#endif /* See strip.py */ + + homa_sock_shutdown(hsk); sk_common_release(sk); - tt_record1("closed socket, port %d\n", hsk->port); - if (hsk->homa->freeze_type == SOCKET_CLOSE) - tt_freeze(); + tt_record1("closed socket, port %d", port); } /** * homa_shutdown() - Implements the shutdown system call for Homa sockets. - * @sk: Socket to shut down. + * @sock: Socket to shut down. * @how: Ignored: for other sockets, can independently shut down * sending and receiving, but for Homa any shutdown will * shut down everything. @@ -683,37 +814,30 @@ int homa_shutdown(struct socket *sock, int how) return 0; } -/** - * homa_disconnect() - Invoked when disconnect system call is invoked on a - * Homa socket. - * @sk: Socket to disconnect - * @flags: ?? - * - * Return: 0 on success, otherwise a negative errno. - */ -int homa_disconnect(struct sock *sk, int flags) { - printk(KERN_WARNING "unimplemented disconnect invoked on Homa socket\n"); - return -ENOSYS; -} - +#ifndef __STRIP__ /* See strip.py */ /** * homa_ioc_abort() - The top-level function for the ioctl that implements * the homa_abort user-level API. - * @sk: Socket for this request. - * @arg: Used to pass information from user space. + * @sock: Socket for this request. + * @arg: User-space address of a homa_abort_args struct. * - * Return: 0 on success, otherwise a negative errno. + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors.
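+ *
+ * A hypothetical user-space sketch of the corresponding call (illustration
+ * only; fd and id are assumed to come from earlier socket/sendmsg calls,
+ * and ECANCELED is just a sample error code):
+ *
+ *	struct homa_abort_args args = {};
+ *
+ *	args.id = id;
+ *	args.error = ECANCELED;
+ *	if (ioctl(fd, HOMAIOCABORT, &args) != 0)
+ *		perror("HOMAIOCABORT");
+ *
+ * An id of zero aborts all of the socket's client RPCs, and an error of
+ * zero frees the matching RPC instead of completing it with an error.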
*/ -int homa_ioc_abort(struct sock *sk, unsigned long arg) { - int ret = 0; - struct homa_sock *hsk = homa_sk(sk); +int homa_ioc_abort(struct socket *sock, unsigned long arg) +{ + struct homa_sock *hsk = homa_sk(sock->sk); struct homa_abort_args args; struct homa_rpc *rpc; + int ret = 0; - if (unlikely(copy_from_user(&args, (void *) arg, sizeof(args)))) + if (unlikely(copy_from_user(&args, (void __user *)arg, sizeof(args)))) { + hsk->error_msg = "invalid address for homa_abort_args"; return -EFAULT; + } if (args._pad1 || args._pad2[0] || args._pad2[1]) { + hsk->error_msg = "reserved fields in homa_abort_args must be zero"; return -EINVAL; } if (args.id == 0) { @@ -721,49 +845,122 @@ int homa_ioc_abort(struct sock *sk, unsigned long arg) { return 0; } - rpc = homa_find_client_rpc(hsk, args.id); - if (rpc == NULL) + rpc = homa_rpc_find_client(hsk, args.id); + if (!rpc) { + hsk->error_msg = "RPC identifier did not match any existing RPC"; return -EINVAL; - if (args.error == 0) { - homa_rpc_free(rpc); - } else { - homa_rpc_abort(rpc, -args.error); } - homa_rpc_unlock(rpc); + if (args.error == 0) + homa_rpc_end(rpc); + else + homa_rpc_abort(rpc, -args.error); + homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_client. */ return ret; } +#endif /* See strip.py */ + +/** + * homa_ioc_info() - The top-level function that implements the + * HOMAIOCINFO ioctl for Homa sockets. + * @sock: Socket for this request + * @arg: The address in user space of the argument to ioctl, which + * is a homa_info struct. + * + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors. + */ +int homa_ioc_info(struct socket *sock, unsigned long arg) +{ + struct homa_sock *hsk = homa_sk(sock->sk); + struct homa_rpc_info rinfo; + struct homa_info hinfo; + struct homa_rpc *rpc; + int bytes_avl; + char *dst; + + if (unlikely(copy_from_user(&hinfo, (void __user *)arg, + sizeof(hinfo)))) { + hsk->error_msg = "invalid address for homa_info"; + return -EFAULT; + } + + if (!homa_protect_rpcs(hsk)) { + hsk->error_msg = "socket has been shut down"; + return -ESHUTDOWN; + } + hinfo.bpool_avail_bytes = homa_pool_avail_bytes(hsk->buffer_pool); + hinfo.port = hsk->port; + dst = (char *)hinfo.rpc_info; + bytes_avl = hinfo.rpc_info_length; + hinfo.num_rpcs = 0; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + homa_rpc_lock(rpc); + if (rpc->state == RPC_DEAD) { + homa_rpc_unlock(rpc); + continue; + } + homa_rpc_get_info(rpc, &rinfo); + homa_rpc_unlock(rpc); + if (dst && bytes_avl >= sizeof(rinfo)) { + if (copy_to_user((void __user *)dst, &rinfo, + sizeof(rinfo))) { + homa_unprotect_rpcs(hsk); + hsk->error_msg = "couldn't copy homa_rpc_info to user space: invalid or read-only address?"; + return -EFAULT; + } + dst += sizeof(rinfo); + bytes_avl -= sizeof(rinfo); + } + hinfo.num_rpcs++; + } + homa_unprotect_rpcs(hsk); + + if (hsk->error_msg) + snprintf(hinfo.error_msg, HOMA_ERROR_MSG_SIZE, "%s", + hsk->error_msg); + else + hinfo.error_msg[0] = 0; + + if (copy_to_user((void __user *)arg, &hinfo, sizeof(hinfo))) { + hsk->error_msg = "couldn't copy homa_info to user space: read-only address?"; + return -EFAULT; + } + return 0; +} /** * homa_ioctl() - Implements the ioctl system call for Homa sockets. - * @sk: Socket on which the system call was invoked. + * @sock: Socket on which the system call was invoked. * @cmd: Identifier for a particular ioctl operation. * @arg: Operation-specific argument; typically the address of a block * of data in user address space. 
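+ *
+ * For example, a caller can fetch status for a socket with HOMAIOCINFO
+ * (hypothetical user-space sketch; leaving rpc_info zeroed skips the
+ * per-RPC details and just returns the counts):
+ *
+ *	struct homa_info hinfo = {};
+ *
+ *	if (ioctl(fd, HOMAIOCINFO, &hinfo) == 0)
+ *		printf("port %d, %d active RPCs\n", (int)hinfo.port,
+ *		       (int)hinfo.num_rpcs);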
* - * Return: 0 on success, otherwise a negative errno. + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors. */ -int homa_ioctl(struct sock *sk, int cmd, unsigned long arg) { - int result; - __u64 start = get_cycles(); +int homa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ +#ifndef __STRIP__ /* See strip.py */ + if (cmd == HOMAIOCABORT) { + u64 start = homa_clock(); + int result; - switch (cmd) { - case HOMAIOCABORT: - result = homa_ioc_abort(sk, arg); + result = homa_ioc_abort(sock, arg); INC_METRIC(abort_calls, 1); - INC_METRIC(abort_cycles, get_cycles() - start); - break; - case HOMAIOCFREEZE: - tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, " - "pid %d", current->pid); + INC_METRIC(abort_cycles, homa_clock() - start); + return result; + } + if (cmd == HOMAIOCFREEZE) { + tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, pid %d", + current->pid); tt_freeze(); - result = 0; - break; - default: - printk(KERN_NOTICE "Unknown Homa ioctl: %d\n", cmd); - result = -EINVAL; - break; + return 0; } - return result; +#endif /* See strip.py */ + if (cmd == HOMAIOCINFO) + return homa_ioc_info(sock, arg); + homa_sk(sock->sk)->error_msg = "ioctl opcode isn't supported by Homa"; + return -EINVAL; } /** @@ -776,8 +973,14 @@ int homa_ioctl(struct sock *sk, int cmd, unsigned long arg) { int homa_socket(struct sock *sk) { struct homa_sock *hsk = homa_sk(sk); - homa_sock_init(hsk, homa); - return 0; + int result; + + result = homa_sock_init(hsk); + if (result != 0) { + homa_sock_shutdown(hsk); + homa_sock_destroy(&hsk->sock); + } + return result; } /** @@ -788,175 +991,314 @@ int homa_socket(struct sock *sk) * @optname: Identifies a particular setsockopt operation. * @optval: Address in user space of information about the option. * @optlen: Number of bytes of data at @optval. - * Return: 0 on success, otherwise a negative errno. + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors. */ -int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) +int homa_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct homa_sock *hsk = homa_sk(sk); - struct homa_set_buf_args args; - __u64 start = get_cycles(); int ret; - if ((level != IPPROTO_HOMA) || (optname != SO_HOMA_SET_BUF) - || (optlen != sizeof(struct homa_set_buf_args))) - return -EINVAL; + if (level != IPPROTO_HOMA) { + hsk->error_msg = "homa_setsockopt invoked with level not IPPROTO_HOMA"; + return -ENOPROTOOPT; + } - if (copy_from_sockptr(&args, optval, optlen)) - return -EFAULT; + if (optname == SO_HOMA_RCVBUF) { + struct homa_rcvbuf_args args; +#ifndef __STRIP__ /* See strip.py */ + u64 start = homa_clock(); +#endif /* See strip.py */ - /* Do a trivial test to make sure we can at least write the first - * page of the region. 
- */ - if (copy_to_user(args.start, &args, sizeof(args))) - return -EFAULT; + if (optlen != sizeof(struct homa_rcvbuf_args)) { + hsk->error_msg = "invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)"; + return -EINVAL; + } - homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_SET_BUF"); - ret = homa_pool_init(&hsk->buffer_pool, hsk->homa, args.start, - args.length); - homa_sock_unlock(hsk); - INC_METRIC(so_set_buf_calls, 1); - INC_METRIC(so_set_buf_cycles, get_cycles() - start); - return ret; + if (copy_from_sockptr(&args, optval, optlen)) { + hsk->error_msg = "invalid address for homa_rcvbuf_args"; + return -EFAULT; + } + + /* Do a trivial test to make sure we can at least write the + * first page of the region. + */ + if (copy_to_user(u64_to_user_ptr(args.start), &args, + sizeof(args))) { + hsk->error_msg = "receive buffer region is not writable"; + return -EFAULT; + } + + ret = homa_pool_set_region(hsk, u64_to_user_ptr(args.start), + args.length); + INC_METRIC(so_set_buf_calls, 1); + INC_METRIC(so_set_buf_cycles, homa_clock() - start); + } else if (optname == SO_HOMA_SERVER) { + int arg; + + if (optlen != sizeof(arg)) { + hsk->error_msg = "invalid optlen argument: must be sizeof(int)"; + return -EINVAL; + } + + if (copy_from_sockptr(&arg, optval, optlen)) { + hsk->error_msg = "invalid address for SO_HOMA_SERVER value"; + return -EFAULT; + } + + if (arg) + hsk->is_server = true; + else + hsk->is_server = false; + ret = 0; + } else { + hsk->error_msg = "setsockopt option not supported by Homa"; + ret = -ENOPROTOOPT; + } + return ret; } /** * homa_getsockopt() - Implements the getsockopt system call for Homa sockets. * @sk: Socket on which the system call was invoked. - * @level: ?? + * @level: Selects level in the network stack to handle the request; + * must be IPPROTO_HOMA. - * @optname: Identifies a particular setsockopt operation. + * @optname: Identifies a particular getsockopt operation. * @optval: Address in user space where the option's value should be stored. - * @option: ??. - * Return: 0 on success, otherwise a negative errno. + * @optlen: Number of bytes available at optval; will be overwritten with + * actual number of bytes stored. + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors.
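+ *
+ * A hypothetical user-space sketch of reading back the receive-buffer
+ * configuration (illustration only; assumes a buffer region was installed
+ * earlier with setsockopt and SO_HOMA_RCVBUF):
+ *
+ *	struct homa_rcvbuf_args args;
+ *	socklen_t len = sizeof(args);
+ *
+ *	if (getsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &args, &len) == 0)
+ *		printf("region at 0x%llx, %llu bytes\n",
+ *		       (unsigned long long)args.start,
+ *		       (unsigned long long)args.length);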
*/ int homa_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option) { - printk(KERN_WARNING "unimplemented getsockopt invoked on Homa socket:" - " level %d, optname %d\n", level, optname); - return -EINVAL; + char __user *optval, int __user *optlen) +{ + struct homa_sock *hsk = homa_sk(sk); + struct homa_rcvbuf_args rcvbuf_args; + int is_server; + void *result; + int len; + + if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int))) { + hsk->error_msg = "invalid address for optlen argument to getsockopt"; + return -EFAULT; + } + if (level != IPPROTO_HOMA) { + hsk->error_msg = "homa_getsockopt invoked with level not IPPROTO_HOMA"; + return -ENOPROTOOPT; + } + if (optname == SO_HOMA_RCVBUF) { + if (len < sizeof(rcvbuf_args)) { + hsk->error_msg = "invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)"; + return -EINVAL; + } + + homa_sock_lock(hsk); + homa_pool_get_rcvbuf(hsk->buffer_pool, &rcvbuf_args); + homa_sock_unlock(hsk); + len = sizeof(rcvbuf_args); + result = &rcvbuf_args; + } else if (optname == SO_HOMA_SERVER) { + if (len < sizeof(is_server)) { + hsk->error_msg = "invalid optlen argument: must be sizeof(int)"; + return -EINVAL; + } + + is_server = hsk->is_server; + len = sizeof(is_server); + result = &is_server; + } else { + hsk->error_msg = "getsockopt option not supported by Homa"; + return -ENOPROTOOPT; + } + + if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int))) { + hsk->error_msg = "couldn't update optlen argument to getsockopt: read-only?"; + return -EFAULT; + } + + if (copy_to_sockptr(USER_SOCKPTR(optval), result, len)) { + hsk->error_msg = "couldn't update optval argument to getsockopt: read-only?"; + return -EFAULT; + } + + return 0; } /** * homa_sendmsg() - Send a request or response message on a Homa socket. - * @sk: Socket on which the system call was invoked. - * @msg: Structure describing the message to send; the msg_control - * field points to additional information. - * @len: Number of bytes of the message. - * Return: 0 on success, otherwise a negative errno. + * @sk: Socket on which the system call was invoked. + * @msg: Structure describing the message to send; the msg_control + * field points to additional information. + * @length: Number of bytes of the message. + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors.
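+ *
+ * A hypothetical user-space sketch of issuing a request (illustration
+ * only; dest, buf and buf_len are assumed to be set up by the caller):
+ *
+ *	struct homa_sendmsg_args args = {};
+ *	struct iovec vec = {buf, buf_len};
+ *	struct msghdr h = {};
+ *
+ *	h.msg_name = &dest;
+ *	h.msg_namelen = sizeof(dest);
+ *	h.msg_iov = &vec;
+ *	h.msg_iovlen = 1;
+ *	h.msg_control = &args;
+ *	h.msg_controllen = sizeof(args);
+ *	if (sendmsg(fd, &h, 0) >= 0)
+ *		printf("request id %llu\n", (unsigned long long)args.id);
+ *
+ * Leaving args.id zero marks the message as a new request; on success the
+ * kernel writes the RPC's id back into args.id. To send a response, the
+ * server instead sets args.id to the id it received from recvmsg.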
 */ -int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { +int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) +{ struct homa_sock *hsk = homa_sk(sk); struct homa_sendmsg_args args; - __u64 start = get_cycles(); - __u64 finish; - int result = 0; + union sockaddr_in_union *addr; struct homa_rpc *rpc = NULL; - sockaddr_in_union *addr = (sockaddr_in_union *) msg->msg_name; + int result = 0; + + IF_NO_STRIP(u64 start = homa_clock()); + IF_NO_STRIP(u64 finish); + +#ifndef __STRIP__ /* See strip.py */ + per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = + start; +#endif /* See strip.py */ + + addr = (union sockaddr_in_union *)msg->msg_name; + if (!addr) { + hsk->error_msg = "no msg_name passed to sendmsg"; + result = -EINVAL; + goto error; + } if (unlikely(!msg->msg_control_is_user)) { + tt_record("homa_sendmsg error: !msg->msg_control_is_user"); + hsk->error_msg = "msg_control argument for sendmsg isn't in user space"; result = -EINVAL; goto error; } - if (unlikely(copy_from_user(&args, msg->msg_control, - sizeof(args)))) { + if (unlikely(copy_from_user(&args, (void __user *)msg->msg_control, + sizeof(args)))) { + hsk->error_msg = "invalid address for msg_control argument to sendmsg"; result = -EFAULT; goto error; } - if (addr->in6.sin6_family != sk->sk_family) { + if (args.flags & ~HOMA_SENDMSG_VALID_FLAGS || + args.reserved != 0) { + hsk->error_msg = "invalid flags or nonzero reserved fields in homa_sendmsg_args"; + result = -EINVAL; + goto error; + } + + if (!homa_sock_wmem_avl(hsk)) { + result = homa_sock_wait_wmem(hsk, + msg->msg_flags & MSG_DONTWAIT); + if (result != 0) + goto error; + } + + if (addr->sa.sa_family != sk->sk_family) { + hsk->error_msg = "address family in sendmsg address must match the socket"; result = -EAFNOSUPPORT; goto error; } - if ((msg->msg_namelen < sizeof(struct sockaddr_in)) - || ((msg->msg_namelen < sizeof(struct sockaddr_in6)) - && (addr->in6.sin6_family == AF_INET6))) { + if (msg->msg_namelen < sizeof(struct sockaddr_in) || + (msg->msg_namelen < sizeof(struct sockaddr_in6) && + addr->in6.sin6_family == AF_INET6)) { + hsk->error_msg = "msg_namelen too short"; result = -EINVAL; goto error; } if (!args.id) { /* This is a request message. */ - INC_METRIC(send_calls, 1); - tt_record4("homa_sendmsg request, target 0x%x:%d, id %u, length %d", - (addr->in6.sin6_family == AF_INET) - ? ntohl(addr->in4.sin_addr.s_addr) - : tt_addr(addr->in6.sin6_addr), - ntohs(addr->in6.sin6_port), - atomic64_read(&hsk->homa->next_outgoing_id), - length); - - rpc = homa_rpc_new_client(hsk, addr); + rpc = homa_rpc_alloc_client(hsk, addr); if (IS_ERR(rpc)) { result = PTR_ERR(rpc); rpc = NULL; goto error; } + homa_rpc_hold(rpc); + if (args.flags & HOMA_SENDMSG_PRIVATE) + set_bit(RPC_PRIVATE, &rpc->flags); + INC_METRIC(send_calls, 1); + tt_record4("homa_sendmsg request, target 0x%x:%d, id %u, length %d", + (addr->in6.sin6_family == AF_INET) + ? ntohl(addr->in4.sin_addr.s_addr) + : tt_addr(addr->in6.sin6_addr), + ntohs(addr->in6.sin6_port), rpc->id, length); rpc->completion_cookie = args.completion_cookie; - result = homa_message_out_init(rpc, &msg->msg_iter, 1); + result = homa_message_out_fill(rpc, &msg->msg_iter, 1); if (result) goto error; args.id = rpc->id; - homa_rpc_unlock(rpc); - rpc = NULL; + homa_rpc_unlock(rpc); /* Locked by homa_rpc_alloc_client.
*/ - if (unlikely(copy_to_user(msg->msg_control, &args, - sizeof(args)))) { - rpc = homa_find_client_rpc(hsk, args.id); + if (unlikely(copy_to_user((void __user *)msg->msg_control, + &args, sizeof(args)))) { + homa_rpc_lock(rpc); + hsk->error_msg = "couldn't update homa_sendmsg_args argument to sendmsg: read-only?"; result = -EFAULT; goto error; } - finish = get_cycles(); + homa_rpc_put(rpc); +#ifndef __STRIP__ /* See strip.py */ + finish = homa_clock(); +#endif /* See strip.py */ INC_METRIC(send_cycles, finish - start); + INC_METRIC(client_requests_started, 1); + INC_METRIC(client_request_bytes_started, length); } else { /* This is a response message. */ struct in6_addr canonical_dest; INC_METRIC(reply_calls, 1); tt_record4("homa_sendmsg response, id %llu, port %d, pid %d, length %d", - args.id, hsk->port, current->pid, length); + args.id, hsk->port, current->pid, length); if (args.completion_cookie != 0) { + hsk->error_msg = "completion_cookie must be zero when sending responses"; result = -EINVAL; goto error; } canonical_dest = canonical_ipv6_addr(addr); - rpc = homa_find_server_rpc(hsk, &canonical_dest, - ntohs(addr->in6.sin6_port), args.id); + rpc = homa_rpc_find_server(hsk, &canonical_dest, args.id); if (!rpc) { - result = -EINVAL; - goto error; + /* Return without an error if the RPC doesn't exist; + * this could be totally valid (e.g. client is + * no longer interested in it). + */ + return 0; } + homa_rpc_hold(rpc); if (rpc->error) { + hsk->error_msg = "RPC has failed, so can't send response"; result = rpc->error; goto error; } if (rpc->state != RPC_IN_SERVICE) { - homa_rpc_unlock(rpc); - rpc = 0; + hsk->error_msg = "RPC is not in a state where a response can be sent"; result = -EINVAL; - goto error; + goto error_dont_end_rpc; } rpc->state = RPC_OUTGOING; - result = homa_message_out_init(rpc, &msg->msg_iter, 1); - if (result) + result = homa_message_out_fill(rpc, &msg->msg_iter, 1); + if (result && rpc->state != RPC_DEAD) goto error; - homa_rpc_unlock(rpc); - finish = get_cycles(); + homa_rpc_put(rpc); + homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ +#ifndef __STRIP__ /* See strip.py */ + finish = homa_clock(); +#endif /* See strip.py */ INC_METRIC(reply_cycles, finish - start); + INC_METRIC(server_responses_started, 1); + INC_METRIC(server_response_bytes_started, length); } tt_record1("homa_sendmsg finished, id %d", args.id); return 0; error: + if (rpc) + homa_rpc_end(rpc); + +error_dont_end_rpc: if (rpc) { - homa_rpc_free(rpc); + homa_rpc_put(rpc); + + /* Locked by homa_rpc_find_server or homa_rpc_alloc_client. */ homa_rpc_unlock(rpc); } tt_record2("homa_sendmsg returning error %d for id %d", - result, args.id); - tt_freeze(); + result, args.id); return result; } @@ -965,237 +1307,215 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { * @sk: Socket on which the system call was invoked. * @msg: Controlling information for the receive. * @len: Total bytes of space available in msg->msg_iov; not used. - * @nonblocking: Non-zero means MSG_DONTWAIT was specified. - * @flags: Flags from system call, not including MSG_DONTWAIT; ignored. + * @flags: Flags from system call; only MSG_DONTWAIT is used. * @addr_len: Store the length of the sender address here * Return: The length of the message on success, otherwise a negative - * errno. + * errno. Sets hsk->error_msg on errors. 
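+ *
+ * A hypothetical user-space sketch of receiving one message (illustration
+ * only; assumes a buffer region was configured with SO_HOMA_RCVBUF):
+ *
+ *	struct homa_recvmsg_args control = {};
+ *	struct sockaddr_in6 peer;
+ *	struct msghdr h = {};
+ *	ssize_t length;
+ *
+ *	h.msg_name = &peer;
+ *	h.msg_namelen = sizeof(peer);
+ *	h.msg_control = &control;
+ *	h.msg_controllen = sizeof(control);
+ *	length = recvmsg(fd, &h, 0);
+ *
+ * On success, control.id identifies the RPC and the message contents live
+ * in the receive buffer region at the offsets in control.bpage_offsets;
+ * passing those offsets back in on the next recvmsg call releases the
+ * buffer pages.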
*/ -int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblocking, int flags, int *addr_len) +int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, + int *addr_len) { struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; - __u64 start = get_cycles(); - struct homa_rpc *rpc; - __u64 finish; + struct homa_rpc *rpc = NULL; + int nonblocking; int result; + IF_NO_STRIP(u64 start = homa_clock()); + IF_NO_STRIP(u64 finish); + INC_METRIC(recv_calls, 1); +#ifndef __STRIP__ /* See strip.py */ + per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; +#endif /* See strip.py */ if (unlikely(!msg->msg_control)) { /* This test isn't strictly necessary, but it provides a * hook for testing kernel call times. */ + hsk->error_msg = "no msg_control passed to recvmsg"; return -EINVAL; } if (msg->msg_controllen != sizeof(control)) { - result = -EINVAL; - goto done; + hsk->error_msg = "invalid msg_controllen in recvmsg"; + return -EINVAL; } - if (unlikely(copy_from_user(&control, msg->msg_control, - sizeof(control)))) { - result = -EFAULT; - goto done; + if (unlikely(copy_from_user(&control, (void __user *)msg->msg_control, + sizeof(control)))) { + hsk->error_msg = "invalid address for msg_control argument to recvmsg"; + return -EFAULT; } control.completion_cookie = 0; - if (control._pad[0] || control._pad[1]) { + tt_record2("homa_recvmsg starting, port %d, pid %d", + hsk->port, current->pid); + + if (control.num_bpages > HOMA_MAX_BPAGES) { + hsk->error_msg = "num_bpages exceeds HOMA_MAX_BPAGES"; result = -EINVAL; goto done; } - tt_record3("homa_recvmsg starting, port %d, pid %d, flags %d", - hsk->port, current->pid, control.flags); - - if ((control.num_bpages > HOMA_MAX_BPAGES) - || (control.flags & ~HOMA_RECVMSG_VALID_FLAGS)) { + if (control.reserved != 0) { + hsk->error_msg = "reserved fields in homa_recvmsg_args must be zero"; + result = -EINVAL; + goto done; + } + if (!hsk->buffer_pool) { + hsk->error_msg = "SO_HOMA_RCVBUF socket option has not been set"; result = -EINVAL; goto done; } - homa_pool_release_buffers(&hsk->buffer_pool, control.num_bpages, - control.bpage_offsets); + result = homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages, + control.bpage_offsets); control.num_bpages = 0; - - rpc = homa_wait_for_message(hsk, nonblocking - ? (control.flags | HOMA_RECVMSG_NONBLOCKING) - : control.flags, control.id); - if (IS_ERR(rpc)) { - /* If we get here, it means there was an error that prevented - * us from finding an RPC to return. If there's an error in - * the RPC itself we won't get here. - */ - result = PTR_ERR(rpc); + if (result != 0) { + hsk->error_msg = "error while releasing buffer pages"; goto done; } - result = rpc->error ? rpc->error : rpc->msgin.total_length; + nonblocking = flags & MSG_DONTWAIT; + if (control.id != 0) { + rpc = homa_rpc_find_client(hsk, control.id); /* Locks RPC. */ + if (!rpc) { + hsk->error_msg = "invalid RPC id passed to recvmsg"; + result = -EINVAL; + goto done; + } + homa_rpc_hold(rpc); + result = homa_wait_private(rpc, nonblocking); + if (result != 0) { + hsk->error_msg = "error while waiting for private RPC to complete"; + control.id = 0; + goto done; + } + } else { + rpc = homa_wait_shared(hsk, nonblocking); + if (IS_ERR(rpc)) { + /* If we get here, it means there was an error that + * prevented us from finding an RPC to return. Errors + * in the RPC itself are handled below.
+ */ + hsk->error_msg = "error while waiting for shared RPC to complete"; + result = PTR_ERR(rpc); + rpc = NULL; + goto done; + } + } + if (rpc->error) { + hsk->error_msg = "RPC failed"; + result = rpc->error; + } else { + result = rpc->msgin.length; + } + +#ifndef __STRIP__ /* See strip.py */ /* Generate time traces on both ends for long elapsed times (used * for performance debugging). */ if (rpc->hsk->homa->freeze_type == SLOW_RPC) { - uint64_t elapsed = (get_cycles() - rpc->start_cycles)>>10; - if ((elapsed <= hsk->homa->temp[1]) - && (elapsed >= hsk->homa->temp[0]) - && homa_is_client(rpc->id) - && (rpc->msgin.total_length < 500)) { - tt_record4("Long RTT: kcycles %d, id %d, peer 0x%x, " - "length %d", - elapsed, rpc->id, - tt_addr(rpc->peer->addr), - rpc->msgin.total_length); - homa_freeze(rpc, SLOW_RPC, "Freezing because of long " - "elapsed time for RPC id %d, peer 0x%x"); + u64 elapsed = (homa_clock() - rpc->start_time) >> 10; + + if (elapsed <= hsk->homa->temp[1] && + elapsed >= hsk->homa->temp[0] && + homa_is_client(rpc->id) && + rpc->msgin.length >= hsk->homa->temp[2] && + rpc->msgin.length < hsk->homa->temp[3]) { + tt_record4("Long RTT: kcycles %d, id %d, peer 0x%x, length %d", + elapsed, rpc->id, tt_addr(rpc->peer->addr), + rpc->msgin.length); + homa_freeze(rpc, SLOW_RPC, + "Freezing because of long elapsed time for RPC id %d, peer 0x%x"); } } +#endif /* See strip.py */ /* Collect result information. */ control.id = rpc->id; control.completion_cookie = rpc->completion_cookie; - if (likely(rpc->msgin.total_length >= 0)) { + if (likely(rpc->msgin.length >= 0)) { control.num_bpages = rpc->msgin.num_bpages; memcpy(control.bpage_offsets, rpc->msgin.bpage_offsets, - sizeof(control.bpage_offsets)); + sizeof(rpc->msgin.bpage_offsets)); } if (sk->sk_family == AF_INET6) { struct sockaddr_in6 *in6 = msg->msg_name; + in6->sin6_family = AF_INET6; in6->sin6_port = htons(rpc->dport); in6->sin6_addr = rpc->peer->addr; *addr_len = sizeof(*in6); } else { struct sockaddr_in *in4 = msg->msg_name; + in4->sin_family = AF_INET; in4->sin_port = htons(rpc->dport); - in4->sin_addr.s_addr = ipv6_to_ipv4( - rpc->peer->addr); + in4->sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr); *addr_len = sizeof(*in4); } + /* This indicates that the application now owns the buffers, so - * we won't free them in homa_rpc_free. + * we won't free them in homa_rpc_end. */ rpc->msgin.num_bpages = 0; - /* Must release the RPC lock (and potentially free the RPC) before - * copying the results back to user space. - */ if (homa_is_client(rpc->id)) { homa_peer_add_ack(rpc); - homa_rpc_free(rpc); + homa_rpc_end(rpc); } else { if (result < 0) - homa_rpc_free(rpc); + homa_rpc_end(rpc); else rpc->state = RPC_IN_SERVICE; } - homa_rpc_unlock(rpc); done: - if (unlikely(copy_to_user(msg->msg_control, &control, sizeof(control)))) { - /* Note: in this case the message's buffers will be leaked. */ - printk(KERN_NOTICE "homa_recvmsg couldn't copy back args\n"); - result = -EFAULT; + /* Note: must release the RPC lock before calling homa_rpc_reap + * or copying results to user space. + */ + if (rpc) { + homa_rpc_put(rpc); + + /* Locked by homa_rpc_find_client or homa_wait_shared. */ + homa_rpc_unlock(rpc); } - /* This is needed to compensate for ____sys_recvmsg (which writes the - * after-before difference for this value back as msg_controllen in - * the user's struct msghdr) so that the value in the user's struct - * doesn't change.
- */ - msg->msg_control = ((char *) msg->msg_control) - + sizeof(struct homa_recvmsg_args); + if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags)) { + /* There are tasks waiting for tx memory, so reap + * immediately. + */ + homa_rpc_reap(hsk, true); + } - finish = get_cycles(); - tt_record3("homa_recvmsg returning id %d, length %d, bpage0 %d", - control.id, result, - control.bpage_offsets[0] >> HOMA_BPAGE_SHIFT); + if (unlikely(copy_to_user((__force void __user *)msg->msg_control, + &control, sizeof(control)))) { + hsk->error_msg = "couldn't update homa_recvmsg_args argument to recvmsg: read-only?"; + result = -EFAULT; + } + +#ifndef __STRIP__ /* See strip.py */ + finish = homa_clock(); +#endif /* See strip.py */ INC_METRIC(recv_cycles, finish - start); + tt_record2("homa_recvmsg returning status %d, id %d", result, + control.id); return result; } /** - * homa_sendpage() - ??. - * @sk: Socket for the operation - * @page: ?? - * @offset: ?? - * @size: ?? - * @flags: ?? - * Return: 0 on success, otherwise a negative errno. - */ -int homa_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) { - printk(KERN_WARNING "unimplemented sendpage invoked on Homa socket\n"); - return -ENOSYS; -} - -/** - * homa_hash() - ??. + * homa_hash() - Not needed for Homa. * @sk: Socket for the operation - * Return: ?? + * Return: Always 0. */ -int homa_hash(struct sock *sk) { - printk(KERN_WARNING "unimplemented hash invoked on Homa socket\n"); - return 0; -} - -/** - * homa_unhash() - ??. - * @sk: Socket for the operation - */ -void homa_unhash(struct sock *sk) { - return; - printk(KERN_WARNING "unimplemented unhash invoked on Homa socket\n"); -} - -/** - * homa_rehash() - ??. - * @sk: Socket for the operation - */ -void homa_rehash(struct sock *sk) { - printk(KERN_WARNING "unimplemented rehash invoked on Homa socket\n"); -} - -/** - * homa_get_port() - It appears that this function is called to assign a - * default port for a socket. - * @sk: Socket for the operation - * @snum: Unclear what this is. - * Return: Zero for success, or a negative errno for an error. - */ -int homa_get_port(struct sock *sk, unsigned short snum) { - /* Homa always assigns ports immediately when a socket is created, - * so there is nothing to do here. - */ +int homa_hash(struct sock *sk) +{ return 0; } /** - * homa_diag_destroy() - ??. + * homa_unhash() - Not needed for Homa. * @sk: Socket for the operation - * @err: ?? - * Return: ?? - */ -int homa_diag_destroy(struct sock *sk, int err) { - printk(KERN_WARNING "unimplemented diag_destroy invoked on Homa socket\n"); - return -ENOSYS; - -} - -/** - * homa_v4_early_demux() - Invoked by IP for ??. - * @skb: Socket buffer. - * Return: Always 0? - */ -int homa_v4_early_demux(struct sk_buff *skb) { - printk(KERN_WARNING "unimplemented early_demux invoked on Homa socket\n"); - return 0; -} - -/** - * homa_v4_early_demux_handler() - invoked by IP for ??. - * @skb: Socket buffer. - * @return: Always 0? */ -int homa_v4_early_demux_handler(struct sk_buff *skb) { - printk(KERN_WARNING "unimplemented early_demux_handler invoked on Homa socket\n"); - return 0; +void homa_unhash(struct sock *sk) +{ } /** @@ -1204,185 +1524,170 @@ int homa_v4_early_demux_handler(struct sk_buff *skb) { * @skb: The incoming packet.
* Return: Always 0 */ -int homa_softirq(struct sk_buff *skb) { - struct common_header *h; - struct sk_buff *packets, *short_packets, *next; - struct sk_buff **prev_link, **short_link; - __u16 dport; - static __u64 last = 0; - __u64 start; +int homa_softirq(struct sk_buff *skb) +{ + struct sk_buff *packets, *other_pkts, *next; + struct sk_buff **prev_link, **other_link; + enum skb_drop_reason reason; + struct homa_common_hdr *h; int header_offset; - int first_packet = 1; - struct homa_sock *hsk; - int num_packets = 0; - int pull_length; - struct homa_lcache lcache; - - /* Accumulates changes to homa->incoming, to avoid repeated - * updates to this shared variable. - */ - int incoming_delta = 0; - start = get_cycles(); + IF_NO_STRIP(struct homa *homa = homa_net(dev_net(skb->dev))->homa); + IF_NO_STRIP(u64 start); + +#ifndef __STRIP__ /* See strip.py */ + start = homa_clock(); + per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start; +#endif /* See strip.py */ INC_METRIC(softirq_calls, 1); - homa_cores[raw_smp_processor_id()]->last_active = start; - homa_lcache_init(&lcache); - if ((start - last) > 1000000) { - int scaled_ms = (int) (10*(start-last)/cpu_khz); - if ((scaled_ms >= 50) && (scaled_ms < 10000)) { -// tt_record3("Gap in incoming packets: %d cycles " -// "(%d.%1d ms)", -// (int) (start - last), scaled_ms/10, -// scaled_ms%10); -// printk(KERN_NOTICE "Gap in incoming packets: %llu " -// "cycles, (%d.%1d ms)", (start - last), -// scaled_ms/10, scaled_ms%10); - } - } - last = start; /* skb may actually contain many distinct packets, linked through - * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. First, pull - * out all the short packets into a separate list, then splice this - * list into the front of the packet list, so that all the short - * packets will get served first. + * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. Make a + * pass through the list to process all of the short packets, + * leaving the longer packets in the list. Also, perform various + * prep/cleanup/error checking functions. */ - + tt_record("homa_softirq starting"); skb->next = skb_shinfo(skb)->frag_list; skb_shinfo(skb)->frag_list = NULL; packets = skb; prev_link = &packets; - short_packets = NULL; - short_link = &short_packets; - for (skb = packets; skb != NULL; skb = skb->next) { - if (skb->len < 1400) { - *prev_link = skb->next; - *short_link = skb; - short_link = &skb->next; - } else - prev_link = &skb->next; - } - *short_link = packets; - packets = short_packets; - - for (skb = packets; skb != NULL; skb = next) { - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + for (skb = packets; skb; skb = next) { next = skb->next; - num_packets++; - /* The code below makes the header available at skb->data, even - * if the packet is fragmented. One complication: it's possible - * that the IP header hasn't yet been removed (this happens for - * GRO packets on the frag_list, since they aren't handled - * explicitly by IP. + /* Make the header available at skb->data, even if the packet + * is fragmented. One complication: it's possible that the IP + * header hasn't yet been removed (this happens for GRO packets + * on the frag_list, since they aren't handled explicitly by IP. 
*/ - header_offset = skb_transport_header(skb) - skb->data; - pull_length = HOMA_MAX_HEADER + header_offset; - if (pull_length > skb->len) - pull_length = skb->len; - if (!pskb_may_pull(skb, pull_length)) { + if (!homa_make_header_avl(skb)) { +#ifndef __STRIP__ /* See strip.py */ if (homa->verbose) - printk(KERN_NOTICE "Homa can't handle fragmented " - "packet (no space for header); " - "discarding\n"); + pr_notice("Homa can't handle fragmented packet (no space for header); discarding\n"); +#endif /* See strip.py */ UNIT_LOG("", "pskb discard"); + reason = SKB_DROP_REASON_HDR_TRUNC; goto discard; } + header_offset = skb_transport_header(skb) - skb->data; if (header_offset) __skb_pull(skb, header_offset); - h = (struct common_header *) skb->data; - if (unlikely((skb->len < sizeof(struct common_header)) - || (h->type < DATA) - || (h->type >= BOGUS) - || (skb->len < header_lengths[h->type-DATA]))) { + /* Reject packets that are too short or have bogus types. */ + h = (struct homa_common_hdr *)skb->data; + if (unlikely(skb->len < sizeof(struct homa_common_hdr) || + h->type < DATA || h->type > MAX_OP || + skb->len < header_lengths[h->type - DATA])) { +#ifndef __STRIP__ /* See strip.py */ + const struct in6_addr saddr = + skb_canonical_ipv6_saddr(skb); if (homa->verbose) - printk(KERN_WARNING - "Homa %s packet from %s too " - "short: %d bytes\n", - homa_symbol_for_type(h->type), - homa_print_ipv6_addr(&saddr), - skb->len - header_offset); + pr_warn("Homa %s packet from %s too short: %d bytes\n", + homa_symbol_for_type(h->type), + homa_print_ipv6_addr(&saddr), + skb->len - header_offset); +#endif /* See strip.py */ INC_METRIC(short_packets, 1); + reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto discard; } - if (first_packet) { - tt_record4("homa_softirq: first packet from 0x%x:%d, " - "id %llu, type %d", - tt_addr(saddr), ntohs(h->sport), - homa_local_id(h->sender_id), h->type); - first_packet = 0; - } +#ifndef __UPSTREAM__ /* See strip.py */ + /* Check for FREEZE here, rather than in homa_incoming.c, so + * it will work even if the RPC and/or socket are unknown. + */ if (unlikely(h->type == FREEZE)) { - /* Check for FREEZE here, rather than in homa_incoming.c, - * so it will work even if the RPC and/or socket are - * unknown. - */ - if (!tt_frozen) { - tt_record4("Freezing because of request on " - "port %d from 0x%x:%d, id %d", - ntohs(h->dport), tt_addr(saddr), - ntohs(h->sport), - homa_local_id(h->sender_id)); + if (!atomic_read(&tt_frozen)) { + homa_rpc_log_active_tt(homa_net( + dev_net(skb->dev))->homa, 0); + tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", + ntohs(h->dport), + tt_addr(skb_canonical_ipv6_saddr(skb)), + ntohs(h->sport), + homa_local_id(h->sender_id)); tt_freeze(); -// homa_rpc_log_active(homa, h->id); -// homa_log_grantable_list(homa); -// homa_log_throttled(homa); } + reason = SKB_CONSUMED; goto discard; } +#endif /* See strip.py */ - dport = ntohs(h->dport); - hsk = homa_sock_find(&homa->port_map, dport); - if (!hsk) { - if (skb_is_ipv6(skb)) - icmp6_send(skb, ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, 0, NULL, - IP6CB(skb)); - else - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - tt_record3("Discarding packet for unknown port %u, " - "id %llu, type %d", dport, - homa_local_id(h->sender_id), h->type); - goto discard; + /* Process the packet now if it is a control packet or + * if it contains an entire short message. 
+ */ + if (h->type != DATA || ntohl(((struct homa_data_hdr *)h) + ->message_length) < 1400) { + UNIT_LOG("; ", "homa_softirq shortcut type 0x%x", + h->type); + *prev_link = skb->next; + skb->next = NULL; + homa_dispatch_pkts(skb); + } else { + prev_link = &skb->next; } - - homa_pkt_dispatch(skb, hsk, &lcache, &incoming_delta); continue; discard: - kfree_skb(skb); + *prev_link = skb->next; + kfree_skb_reason(skb, reason); } - homa_lcache_release(&lcache); - atomic_add(incoming_delta, &homa->total_incoming); - homa_send_grants(homa); - atomic_dec(&homa_cores[raw_smp_processor_id()]->softirq_backlog); - INC_METRIC(softirq_cycles, get_cycles() - start); - return 0; -} + /* Now process the longer packets. Each iteration of this loop + * collects all of the packets for a particular RPC and dispatches + * them (batching the packets for an RPC allows more efficient + * generation of grants). + */ + while (packets) { + struct in6_addr saddr, saddr2; + struct homa_common_hdr *h2; + struct sk_buff *skb2; + + skb = packets; + prev_link = &skb->next; + saddr = skb_canonical_ipv6_saddr(skb); + other_pkts = NULL; + other_link = &other_pkts; + h = (struct homa_common_hdr *)skb->data; + for (skb2 = skb->next; skb2; skb2 = next) { + next = skb2->next; + h2 = (struct homa_common_hdr *)skb2->data; + if (h2->sender_id == h->sender_id) { + saddr2 = skb_canonical_ipv6_saddr(skb2); + if (ipv6_addr_equal(&saddr, &saddr2)) { + *prev_link = skb2; + prev_link = &skb2->next; + continue; + } + } + *other_link = skb2; + other_link = &skb2->next; + } + *prev_link = NULL; + *other_link = NULL; +#ifdef __UNIT_TEST__ + UNIT_LOG("; ", "id %lld, offsets", homa_local_id(h->sender_id)); + for (skb2 = packets; skb2; skb2 = skb2->next) { + struct homa_data_hdr *h3 = (struct homa_data_hdr *) + skb2->data; + UNIT_LOG("", " %d", ntohl(h3->seg.offset)); + } +#endif /* __UNIT_TEST__ */ + homa_dispatch_pkts(packets); + packets = other_pkts; + } -/** - * homa_backlog_rcv() - Invoked to handle packets saved on a socket's - * backlog because it was locked when the packets first arrived. - * @sk: Homa socket that owns the packet's destination port. - * @skb: The incoming packet. This function takes ownership of the packet - * (we'll delete it). - * - * Return: Always returns 0. - */ -int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb) -{ - printk(KERN_WARNING "unimplemented backlog_rcv invoked on Homa socket\n"); - kfree_skb(skb); +#ifndef __STRIP__ /* See strip.py */ + atomic_dec(&per_cpu(homa_offload_core, raw_smp_processor_id()).softirq_backlog); +#endif /* See strip.py */ + INC_METRIC(softirq_cycles, homa_clock() - start); return 0; } /** * homa_err_handler_v4() - Invoked by IP to handle an incoming error * packet, such as ICMP UNREACHABLE. + * @skb: The incoming packet; skb->data points to the byte just after + * the ICMP header (the first byte of the embedded packet IP header). - * @skb: The incoming packet. * @info: Information about the error that occurred?
* @@ -1390,40 +1695,47 @@ int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb) */ int homa_err_handler_v4(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = ip_hdr(skb); - int type = icmp_hdr(skb)->type; - int code = icmp_hdr(skb)->code; - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - - if ((type == ICMP_DEST_UNREACH) && (code == ICMP_PORT_UNREACH)) { - struct common_header *h; - char *icmp = (char *) icmp_hdr(skb); - iph = (struct iphdr *) (icmp + sizeof(struct icmphdr)); - h = (struct common_header *) (icmp + sizeof(struct icmphdr) - + iph->ihl*4); - homa_abort_rpcs(homa, &saddr, htons(h->dport), -ENOTCONN); + struct homa *homa = homa_net(dev_net(skb->dev))->homa; + const struct icmphdr *icmp = icmp_hdr(skb); + struct in6_addr daddr; + int type = icmp->type; + int code = icmp->code; + struct iphdr *iph; + int error = 0; + int port = 0; + + iph = (struct iphdr *)(skb->data); + ipv6_addr_set_v4mapped(iph->daddr, &daddr); + if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) { + struct homa_common_hdr *h = (struct homa_common_hdr *)(skb->data + + iph->ihl * 4); + + port = ntohs(h->dport); + error = -ENOTCONN; } else if (type == ICMP_DEST_UNREACH) { - int error; if (code == ICMP_PROT_UNREACH) error = -EPROTONOSUPPORT; else error = -EHOSTUNREACH; - tt_record2("ICMP destination unreachable: 0x%x (daddr 0x%x)", - iph->saddr, iph->daddr); - homa_abort_rpcs(homa, &saddr, 0, error); } else { - printk(KERN_NOTICE "homa_err_handler_v4 invoked with " - "info %x, ICMP type %d, ICMP code %d\n", - info, type, code); + pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n", + __func__, info, type, code); } + if (error != 0) + homa_abort_rpcs(homa, &daddr, port, error); return 0; } /** * homa_err_handler_v6() - Invoked by IP to handle an incoming error * packet, such as ICMP UNREACHABLE. - * @skb: The incoming packet. - * @info: Information about the error that occurred? + * @skb: The incoming packet; skb->data points to the byte just after + * the ICMP header (the first byte of the embedded packet IP header). + * @opt: Not used. + * @type: Type of ICMP packet. + * @code: Additional information about the error. + * @offset: Not used. + * @info: Information about the error that occurred? * * Return: zero, or a negative errno if the error couldn't be handled here. 
*/ @@ -1431,29 +1743,23 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; - - if ((type == ICMPV6_DEST_UNREACH) && (code == ICMPV6_PORT_UNREACH)) { - struct common_header *h; - char *icmp = (char *) icmp_hdr(skb); - iph = (struct ipv6hdr *) (icmp + sizeof(struct icmphdr)); - h = (struct common_header *) (icmp + sizeof(struct icmphdr) - + HOMA_IPV6_HEADER_LENGTH); - homa_abort_rpcs(homa, &iph->daddr, htons(h->dport), -ENOTCONN); - } else if (type == ICMPV6_DEST_UNREACH) { - int error; - if (code == ICMP_PROT_UNREACH) - error = -EPROTONOSUPPORT; - else - error = -EHOSTUNREACH; - tt_record2("ICMPv6 destination unreachable: 0x%x (daddr 0x%x)", - tt_addr(iph->saddr), tt_addr(iph->daddr)); - homa_abort_rpcs(homa, &iph->daddr, 0, error); - } else { - if (homa->verbose) - printk(KERN_NOTICE "homa_err_handler_v6 invoked with " - "info %x, ICMP type %d, ICMP code %d\n", - info, type, code); + struct homa *homa = homa_net(dev_net(skb->dev))->homa; + int error = 0; + int port = 0; + + if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) { + const struct homa_common_hdr *h; + + h = (struct homa_common_hdr *)(skb->data + sizeof(*iph)); + port = ntohs(h->dport); + error = -ENOTCONN; + } else if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_ADDR_UNREACH) { + error = -EHOSTUNREACH; + } else if (type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_NEXTHDR) { + error = -EPROTONOSUPPORT; } + if (error != 0) + homa_abort_rpcs(homa, &iph->daddr, port, error); return 0; } @@ -1469,108 +1775,31 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, * state of the socket. */ __poll_t homa_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait) { - struct sock *sk = sock->sk; + struct poll_table_struct *wait) +{ + struct homa_sock *hsk = homa_sk(sock->sk); __poll_t mask; - /* It seems to be standard practice for poll functions *not* to - * acquire the socket lock, so we don't do it here; not sure - * why... - */ - + mask = 0; sock_poll_wait(file, sock, wait); - mask = POLLOUT | POLLWRNORM; - - if (!list_empty(&homa_sk(sk)->ready_requests) || - !list_empty(&homa_sk(sk)->ready_responses)) - mask |= POLLIN | POLLRDNORM; + tt_record2("homa_poll found sk_wmem_alloc %d, sk_sndbuf %d", + refcount_read(&hsk->sock.sk_wmem_alloc), + hsk->sock.sk_sndbuf); + if (homa_sock_wmem_avl(hsk)) + mask |= EPOLLOUT | EPOLLWRNORM; + else + set_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags); + + if (hsk->shutdown) + mask |= EPOLLIN; + + if (!list_empty(&hsk->ready_rpcs)) + mask |= EPOLLIN | EPOLLRDNORM; + tt_record1("homa_poll returning mask 0x%x", (__force int)mask); return mask; } -/** - * homa_metrics_open() - This function is invoked when /proc/net/homa_metrics is - * opened. - * @inode: The inode corresponding to the file. - * @file: Information about the open file. - * - * Return: always 0. - */ -int homa_metrics_open(struct inode *inode, struct file *file) -{ - /* Collect all of the metrics when the file is opened, and save - * these for use by subsequent reads (don't want the metrics to - * change between reads). If there are concurrent opens on the - * file, only read the metrics once, during the first open, and - * use this copy for subsequent opens, until the file has been - * completely closed. 
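
homa_poll() above maps socket state onto the standard poll bits, so ordinary poll()/epoll loops work on Homa sockets. A minimal user-space sketch, not part of the patch: the helper name and error handling are illustrative, and IPPROTO_HOMA's value (146) comes from the module aliases at the end of this file.

#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_HOMA
#define IPPROTO_HOMA 146	/* See MODULE_ALIAS_NET_PF_PROTO_TYPE below. */
#endif

/* Illustrative helper: wait until a complete incoming message is ready. */
int wait_for_homa_message(int timeout_ms)
{
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_HOMA);
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int ready;

	if (fd < 0) {
		perror("socket(IPPROTO_HOMA)");
		return -1;
	}

	/* POLLIN is reported once an RPC is on ready_rpcs (or the socket
	 * has been shut down); POLLOUT tracks send-buffer space.
	 */
	ready = poll(&pfd, 1, timeout_ms);
	close(fd);
	return ready;
}
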
- */ - spin_lock(&homa->metrics_lock); - if (homa->metrics_active_opens == 0) { - homa_print_metrics(homa); - } - homa->metrics_active_opens++; - spin_unlock(&homa->metrics_lock); - return 0; -} - -/** - * homa_metrics_read() - This function is invoked to handle read kernel calls on - * /proc/net/homa_metrics. - * @file: Information about the file being read. - * @buffer: Address in user space of the buffer in which data from the file - * should be returned. - * @length: Number of bytes available at @buffer. - * @offset: Current read offset within the file. - * - * Return: the number of bytes returned at @buffer. 0 means the end of the - * file was reached, and a negative number indicates an error (-errno). - */ -ssize_t homa_metrics_read(struct file *file, char __user *buffer, - size_t length, loff_t *offset) -{ - size_t copied; - - if (*offset >= homa->metrics_length) - return 0; - copied = homa->metrics_length - *offset; - if (copied > length) - copied = length; - if (copy_to_user(buffer, homa->metrics + *offset, copied)) - return -EFAULT; - *offset += copied; - return copied; -} - - -/** - * homa_metrics_lseek() - This function is invoked to handle seeks on - * /proc/net/homa_metrics. Right now seeks are ignored: the file must be - * read sequentially. - * @file: Information about the file being read. - * @offset: Distance to seek, in bytes - * @whence: Starting point from which to measure the distance to seek. - */ -loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence) -{ - return 0; -} - -/** - * homa_metrics_release() - This function is invoked when the last reference to - * an open /proc/net/homa_metrics is closed. It performs cleanup. - * @inode: The inode corresponding to the file. - * @file: Information about the open file. - * - * Return: always 0. - */ -int homa_metrics_release(struct inode *inode, struct file *file) -{ - spin_lock(&homa->metrics_lock); - homa->metrics_active_opens--; - spin_unlock(&homa->metrics_lock); - return 0; -} - +#ifndef __STRIP__ /* See strip.py */ /** * homa_dointvec() - This function is a wrapper around proc_dointvec. It is * invoked to read and write sysctl values and also update other values @@ -1583,50 +1812,158 @@ int homa_metrics_release(struct inode *inode, struct file *file) * * Return: 0 for success, nonzero for error. */ -int homa_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int homa_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { + struct homa *homa = homa_net(current->nsproxy->net_ns)->homa; + struct ctl_table table_copy; int result; - result = proc_dointvec(table, write, buffer, lenp, ppos); + + /* Generate a new ctl_table that refers to a field in the + * net-specific struct homa. + */ + table_copy = *table; + table_copy.data = ((char *)homa) + (uintptr_t)table_copy.data; + + result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); if (write) { - /* Don't worry which particular value changed; update - * all info that is dependent on any sysctl value. + /* Update any information that is dependent on sysctl values + * (don't worry about which value changed, just refresh all + * dependent information). */ homa_incoming_sysctl_changed(homa); - homa_outgoing_sysctl_changed(homa); + homa_pacer_update_sysctl_deps(homa->pacer); + homa_qdisc_update_sysctl_deps(homa->qshared); /* For this value, only call the method when this * particular value was written (don't want to increment * cutoff_version otherwise). 
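
The table_copy trick above is worth seeing in isolation: table entries store offsetof() values in .data, and the handler rebinds them to the per-namespace structure on each access. A hedged, self-contained sketch of the same convention (every name here is hypothetical):

#include <linux/stddef.h>
#include <linux/sysctl.h>

/* Hypothetical per-namespace parameter block. */
struct my_params {
	int verbose;
};

struct my_params *my_params_for_net(void);	/* hypothetical lookup */

static int my_dointvec(const struct ctl_table *table, int write,
		       void *buffer, size_t *lenp, loff_t *ppos);

static struct ctl_table my_table[] = {
	{
		.procname = "verbose",
		/* Store an offset, not a pointer. */
		.data = (void *)offsetof(struct my_params, verbose),
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = my_dointvec,
	},
};

static int my_dointvec(const struct ctl_table *table, int write,
		       void *buffer, size_t *lenp, loff_t *ppos)
{
	struct my_params *params = my_params_for_net();
	struct ctl_table copy = *table;

	/* Rebind the stored offset to this namespace's structure. */
	copy.data = (char *)params + (uintptr_t)table->data;
	return proc_dointvec(&copy, write, buffer, lenp, ppos);
}
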
 */
- if ((table->data == &homa_data.unsched_cutoffs)
- || (table->data == &homa_data.num_priorities)) {
+ if (table_copy.data == &homa->unsched_cutoffs ||
+ table_copy.data == &homa->num_priorities) {
 homa_prios_changed(homa);
 }

- /* Handle the special value log_topic by invoking a function
+ if (homa->next_id != 0) {
+ atomic64_set(&homa->next_outgoing_id, homa->next_id);
+ homa->next_id = 0;
+ }
+
+ /* Handle the special value "action" by invoking a function
 * to print information to the log.
 */
- if (table->data == &log_topic) {
- if (log_topic == 1)
- homa_log_grantable_list(homa);
- else if (log_topic == 2)
+ if (table_copy.data == &homa->sysctl_action) {
+ if (homa->sysctl_action == 2) {
 homa_rpc_log_active(homa, 0);
- else if (log_topic == 3) {
+ } else if (homa->sysctl_action == 3) {
 tt_record("Freezing because of sysctl");
 tt_freeze();
- } else if (log_topic == 4)
- homa_log_throttled(homa);
- else if (log_topic == 5)
+ } else if (homa->sysctl_action == 4) {
+ homa_pacer_log_throttled(homa->pacer);
+ } else if (homa->sysctl_action == 5) {
 tt_printk();
- else
- homa_rpc_log_active(homa, log_topic);
- log_topic = 0;
+ } else if (homa->sysctl_action == 6) {
+ tt_record("Calling homa_rpc_log_active because of action 6");
+ homa_rpc_log_active_tt(homa, 0);
+ tt_record("Freezing because of action 6");
+ tt_freeze();
+ } else if (homa->sysctl_action == 7) {
+ homa_rpc_log_active_tt(homa, 0);
+ tt_record("Freezing cluster because of action 7");
+ homa_freeze_peers();
+ tt_record("Finished freezing cluster");
+ tt_freeze();
+ } else if (homa->sysctl_action == 8) {
+ pr_notice("homa_total_incoming is %d\n",
+ atomic_read(&homa->grant->total_incoming));
+ } else if (homa->sysctl_action == 9) {
+ homa_rpc_stats_log();
+ } else if (homa->sysctl_action == 10) {
+ tt_unfreeze();
+ } else {
+ homa_rpc_log_active(homa, homa->sysctl_action);
+ }
+ homa->sysctl_action = 0;
 }
 }
 return result;
 }

+/**
+ * homa_sysctl_softirq_cores() - This function is invoked to handle sysctl
+ * requests for the "gen3_softirq_cores" target, which requires special
+ * processing.
+ * @table: sysctl table describing value to be read or written.
+ * @write: Nonzero means value is being written, 0 means read.
+ * @buffer: Address in user space of the input/output data.
+ * @lenp: Pointer to the number of bytes available at @buffer; updated
+ * to reflect the number of bytes actually transferred.
+ * @ppos: Pointer to the current offset in the virtual file; updated
+ * after the operation.
+ *
+ * Return: 0 for success, nonzero for error.
+ */
+int homa_sysctl_softirq_cores(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct homa_offload_core *offload_core;
+ struct ctl_table table_copy;
+ int max_values, *values;
+ int result, i;
+
+ max_values = (NUM_GEN3_SOFTIRQ_CORES + 1) * nr_cpu_ids;
+ values = kmalloc_array(max_values, sizeof(int), GFP_KERNEL);
+ if (!values)
+ return -ENOMEM;
+
+ table_copy = *table;
+ table_copy.data = values;
+ if (write) {
+ /* First value is core id, others are contents of its
+ * gen3_softirq_cores.
+ */
+ for (i = 0; i < max_values; i++)
+ values[i] = -1;
+ table_copy.maxlen = max_values * sizeof(int);
+ result = proc_dointvec(&table_copy, write, buffer, lenp, ppos);
+ if (result != 0)
+ goto done;
+ for (i = 0; i < max_values;
+ i += NUM_GEN3_SOFTIRQ_CORES + 1) {
+ int j;
+
+ if (values[i] < 0)
+ break;
+ offload_core = &per_cpu(homa_offload_core, values[i]);
+ for (j = 0; j < NUM_GEN3_SOFTIRQ_CORES; j++)
+ offload_core->gen3_softirq_cores[j] =
+ values[i + j + 1];
+ }
+ } else {
+ /* Read: return values from all of the cores.
*/ + int *dst; + + table_copy.maxlen = 0; + dst = values; + for (i = 0; i < nr_cpu_ids; i++) { + int j; + + *dst = i; + dst++; + table_copy.maxlen += sizeof(int); + offload_core = &per_cpu(homa_offload_core, i); + for (j = 0; j < NUM_GEN3_SOFTIRQ_CORES; j++) { + *dst = offload_core->gen3_softirq_cores[j]; + dst++; + table_copy.maxlen += sizeof(int); + } + } + result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); + } +done: + kfree(values); + return result; +} +#endif /* See strip.py */ + /** * homa_hrtimer() - This function is invoked by the hrtimer mechanism to * wake up the timer thread. Runs at IRQ level. @@ -1642,29 +1979,34 @@ enum hrtimer_restart homa_hrtimer(struct hrtimer *timer) /** * homa_timer_main() - Top-level function for the timer thread. - * @transportInfo: Pointer to struct homa. + * @transport: Pointer to struct homa. * * Return: Always 0. */ -int homa_timer_main(void *transportInfo) +int homa_timer_main(void *transport) { - struct homa *homa = (struct homa *) transportInfo; - u64 nsec; + struct homa *homa = (struct homa *)transport; ktime_t tick_interval; - struct hrtimer hrtimer; + u64 nsec; +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer.function = &homa_hrtimer; +#else + hrtimer_setup(&hrtimer, homa_hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); +#endif nsec = 1000000; /* 1 ms */ tick_interval = ns_to_ktime(nsec); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!exiting) { - hrtimer_start(&hrtimer, tick_interval, HRTIMER_MODE_REL); + if (!timer_thread_exit) { + hrtimer_start(&hrtimer, tick_interval, + HRTIMER_MODE_REL); schedule(); } __set_current_state(TASK_RUNNING); - if (exiting) + if (timer_thread_exit) break; homa_timer(homa); } @@ -1672,3 +2014,17 @@ int homa_timer_main(void *transportInfo) kthread_complete_and_exit(&timer_thread_done, 0); return 0; } + +#ifndef __UNIT_TEST__ +MODULE_LICENSE("Dual BSD/GPL"); +#endif /* __UNIT_TEST__ */ +MODULE_AUTHOR("John Ousterhout "); +MODULE_DESCRIPTION("Homa transport protocol"); +MODULE_VERSION("1.0"); + +/* Arrange for this module to be loaded automatically when a Homa socket is + * opened. Apparently symbols don't work in the macros below, so must use + * numeric values for IPPROTO_HOMA (146) and SOCK_DGRAM(2). + */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 146, 2); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 146, 2); diff --git a/homa_pool.c b/homa_pool.c index 0c588a3b..9bc6b33f 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -1,113 +1,174 @@ -/* Copyright (c) 2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */
+// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+

 #include "homa_impl.h"
+#ifndef __STRIP__ /* See strip.py */
+#include "homa_grant.h"
+#endif /* See strip.py */
+#include "homa_pool.h"

-/* Pools must always have at least this many active pages. */
-#define MIN_ACTIVE 4
+/* This file contains functions that manage user-space buffer pools. */

-/* When running unit tests, allow HOMA_BPAGE_SIZE and HOMA_BPAGE_SHIFT
- * to be overriden.
+/* Pools must always have at least this many bpages (no particular
+ * reasoning behind this value).
 */
+#define MIN_POOL_SIZE 2
+
+/* Used when determining how many bpages to consider for allocation. */
+#define MIN_EXTRA 4
+
 #ifdef __UNIT_TEST__
+/* When running unit tests, allow HOMA_BPAGE_SIZE and HOMA_BPAGE_SHIFT
+ * to be overridden.
+ */
 #include "mock.h"
 #undef HOMA_BPAGE_SIZE
 #define HOMA_BPAGE_SIZE mock_bpage_size
 #undef HOMA_BPAGE_SHIFT
 #define HOMA_BPAGE_SHIFT mock_bpage_shift
-#endif
+#endif /* __UNIT_TEST__ */

 /**
- * homa_pool_init() - Initialize a homa_pool; any previous contents of the
- * objects are overwritten.
- * @pool: Pool to initialize.
- * @homa Overall information about Homa.
- * @region First byte of the memory region for the pool, allocated
+ * set_bpages_needed() - Set the bpages_needed field of @pool based
+ * on the length of the first RPC that's waiting for buffer space.
+ * The caller must own the lock for @pool->hsk.
+ * @pool: Pool to update.
+ */
+static void set_bpages_needed(struct homa_pool *pool)
+{
+ struct homa_rpc *rpc = list_first_entry(&pool->hsk->waiting_for_bufs,
+ struct homa_rpc, buf_links);
+
+ pool->bpages_needed = (rpc->msgin.length + HOMA_BPAGE_SIZE - 1) >>
+ HOMA_BPAGE_SHIFT;
+}
+
+/**
+ * homa_pool_alloc() - Allocate and initialize a new homa_pool (it will have
+ * no region associated with it until homa_pool_set_region is invoked).
+ * @hsk: Socket the pool will be associated with.
+ * Return: A pointer to the new pool or an ERR_PTR-encoded errno.
+ */
+struct homa_pool *homa_pool_alloc(struct homa_sock *hsk)
+{
+ struct homa_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+ pool->hsk = hsk;
+ return pool;
+}
+
+/**
+ * homa_pool_set_region() - Associate a region of memory with a pool.
+ * @hsk: Socket whose pool the region will be associated with.
+ * Must not be locked, and the pool must not currently
+ * have a region associated with it.
+ * @region: First byte of the memory region for the pool, allocated
 * by the application; must be page-aligned.
- * @region_size Total number of bytes available at @buf_region.
+ * @region_size: Total number of bytes available at @region.
 * Return: Either zero (for success) or a negative errno for failure.
 */
-int homa_pool_init(struct homa_pool *pool, struct homa *homa,
- void *region, __u64 region_size)
+int homa_pool_set_region(struct homa_sock *hsk, void __user *region,
+ u64 region_size)
 {
- int i, result;
+ struct homa_pool_core __percpu *cores;
+ struct homa_bpage *descriptors;
+ int i, result, num_bpages;
+ struct homa_pool *pool;

- if (((__u64) region) & ~PAGE_MASK)
+ if (((uintptr_t)region) & ~PAGE_MASK)
 return -EINVAL;
- pool->cores = NULL;
- pool->region = (char *) region;
- pool->num_bpages = region_size >> HOMA_BPAGE_SHIFT;
- if (pool->num_bpages < MIN_ACTIVE) {
- result = -EINVAL;
+
+ /* Allocate memory before locking the socket, so we can allocate
+ * without GFP_ATOMIC.
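
For orientation, this is roughly what the user-space side of homa_pool_set_region() looks like. The sketch below leans on struct homa_rcvbuf_args as described by homa_pool_get_rcvbuf() further down (a start address plus a length); the option name HOMA_SO_RCVBUF is taken from that comment, while the setsockopt level and the 64 KB bpage size are assumptions — check homa.h for the real definitions.

#include <stdint.h>
#include <sys/mman.h>
#include <sys/socket.h>

#include "homa.h"	/* Assumed to define IPPROTO_HOMA, HOMA_SO_RCVBUF
			 * and struct homa_rcvbuf_args. */

int set_rcvbuf_region(int fd)
{
	size_t length = 64 << 20;	/* 64 MB: 1024 bpages if bpages are 64 KB. */
	void *region = mmap(NULL, length, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct homa_rcvbuf_args args;

	if (region == MAP_FAILED)
		return -1;
	args.start = (uintptr_t)region;
	args.length = length;

	/* mmap guarantees the page alignment that homa_pool_set_region()
	 * checks for; the level argument here is an assumption.
	 */
	return setsockopt(fd, IPPROTO_HOMA, HOMA_SO_RCVBUF,
			  &args, sizeof(args));
}
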
+ */ + num_bpages = region_size >> HOMA_BPAGE_SHIFT; + if (num_bpages < MIN_POOL_SIZE) + return -EINVAL; + descriptors = kmalloc_array(num_bpages, sizeof(struct homa_bpage), + GFP_KERNEL | __GFP_ZERO); + if (!descriptors) + return -ENOMEM; + cores = alloc_percpu_gfp(struct homa_pool_core, __GFP_ZERO); + if (!cores) { + result = -ENOMEM; goto error; } - pool->homa = homa; - pool->descriptors = (struct homa_bpage *) kmalloc( - pool->num_bpages * sizeof(struct homa_bpage), - GFP_ATOMIC); - if (!pool->descriptors) { - result = -ENOMEM; + + homa_sock_lock(hsk); + pool = hsk->buffer_pool; + if (pool->region) { + result = -EINVAL; + homa_sock_unlock(hsk); goto error; } + + pool->region = (char __user *)region; + pool->num_bpages = num_bpages; + pool->descriptors = descriptors; + atomic_set(&pool->free_bpages, pool->num_bpages); + pool->bpages_needed = INT_MAX; + pool->cores = cores; + pool->check_waiting_invoked = 0; + for (i = 0; i < pool->num_bpages; i++) { struct homa_bpage *bp = &pool->descriptors[i]; + spin_lock_init(&bp->lock); - atomic_set(&bp->refs, 0); bp->owner = -1; - bp->expiration = 0; - } - atomic_set(&pool->active_pages, MIN_ACTIVE); - atomic_set(&pool->next_scan, 0); - atomic_set(&pool->free_bpages_found, 0); - - /* Allocate and initialize core-specific data. */ - pool->cores = (struct homa_pool_core *) kmalloc(nr_cpu_ids * - sizeof(struct homa_pool_core), GFP_ATOMIC); - if (!pool->cores) { - result = -ENOMEM; - goto error; - } - pool->num_cores = nr_cpu_ids; - for (i = 0; i < pool->num_cores; i++) { - pool->cores[i].page_hint = 0; - pool->cores[i].allocated = 0; } + homa_sock_unlock(hsk); return 0; - error: - if (pool->descriptors) - kfree(pool->descriptors); - if (pool->cores) - kfree(pool->cores); - pool->region = NULL; +error: + kfree(descriptors); + free_percpu(cores); return result; } /** - * homa_pool_destroy() - Destructor for homa_pool. After this method - * returns, the object should not be used unless it has been reinitialized. + * homa_pool_free() - Destructor for homa_pool. After this method + * returns, the object should not be used (it will be freed here). * @pool: Pool to destroy. */ -void homa_pool_destroy(struct homa_pool *pool) +void homa_pool_free(struct homa_pool *pool) { - if (!pool->region) - return; - kfree(pool->descriptors); - kfree(pool->cores); - pool->region = NULL; + if (pool->region) { + kfree(pool->descriptors); + free_percpu(pool->cores); + pool->region = NULL; + } + kfree(pool); +} + +/** + * homa_pool_get_rcvbuf() - Return information needed to handle getsockopt + * for HOMA_SO_RCVBUF. + * @pool: Pool for which information is needed. + * @args: Store info here. + */ +void homa_pool_get_rcvbuf(struct homa_pool *pool, + struct homa_rcvbuf_args *args) +{ + args->start = (uintptr_t)pool->region; + args->length = pool->num_bpages << HOMA_BPAGE_SHIFT; +} + +/** + * homa_bpage_available() - Check whether a bpage is available for use. + * @bpage: Bpage to check + * @now: Current time (homa_clock() units) + * Return: True if the bpage is free or if it can be stolen, otherwise + * false. + */ +bool homa_bpage_available(struct homa_bpage *bpage, u64 now) +{ + int ref_count = atomic_read(&bpage->refs); + + return ref_count == 0 || (ref_count == 1 && bpage->owner >= 0 && + bpage->expiration <= now); } /** @@ -116,174 +177,234 @@ void homa_pool_destroy(struct homa_pool *pool) * @num_pages: Number of pages needed * @pages: The indices of the allocated pages are stored here; caller * must ensure this array is big enough. 
Reference counts have - * been set to 1 on all of these pages. + * been set to 1 on all of these pages (or 2 if set_owner + * was specified). * @set_owner: If nonzero, the current core is marked as owner of all * of the allocated pages (and the expiration time is also - * set). Otherwises the pages are left unowned. + * set). Otherwise the pages are left unowned. * Return: 0 for success, -1 if there wasn't enough free space in the pool. -*/ -int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, - int set_owner) + */ +int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, + int set_owner) { + int core_num = smp_processor_id(); + struct homa_pool_core *core; + u64 now = homa_clock(); int alloced = 0; - __u64 now = get_cycles(); + int limit = 0; - int active = atomic_read(&pool->active_pages); - int i; + core = this_cpu_ptr(pool->cores); + if (atomic_sub_return(num_pages, &pool->free_bpages) < 0) { + atomic_add(num_pages, &pool->free_bpages); + return -1; + } - while (1) { - int cur = atomic_fetch_inc(&pool->next_scan); + /* Once we get to this point we know we will be able to find + * enough free pages; now we just have to find them. + */ + while (alloced != num_pages) { struct homa_bpage *bpage; + int cur; + + /* If we don't need to use all of the bpages in the pool, + * then try to use only the ones with low indexes. This + * will reduce the cache footprint for the pool by reusing + * a few bpages over and over. Specifically this code will + * not consider any candidate page whose index is >= limit. + * Limit is chosen to make sure there are a reasonable + * number of free pages in the range, so we won't have to + * check a huge number of pages. + */ + if (limit == 0) { + int extra; - if (cur >= active) { - int free = atomic_read(&pool->free_bpages_found); - if ((free == 0) && (active == pool->num_bpages)) { - break; - } - if (active > 4*free) { - /* < 25% of pages free; grow active pool. */ - active += num_pages - alloced; - if (active > pool->num_bpages) - active = pool->num_bpages; - atomic_set(&pool->active_pages, active); - } else if (2*free > active) { - /* > 50% of pages free; shrink active - * pool by 10%. - */ - active -= active/10; - atomic_set(&pool->active_pages, - (active >= MIN_ACTIVE) - ? active : MIN_ACTIVE); - } - if (cur >= active) { - atomic_set(&pool->free_bpages_found, 0); - atomic_set(&pool->next_scan, 0); - continue; - } + limit = pool->num_bpages - + atomic_read(&pool->free_bpages); + extra = limit >> 2; + limit += (extra < MIN_EXTRA) ? MIN_EXTRA : extra; + if (limit > pool->num_bpages) + limit = pool->num_bpages; } + cur = core->next_candidate; + core->next_candidate++; + if (cur >= limit) { + core->next_candidate = 0; + + /* Must recompute the limit for each new loop through + * the bpage array: we may need to consider a larger + * range of pages because of concurrent allocations. + */ + limit = 0; + continue; + } bpage = &pool->descriptors[cur]; - /* Don't lock the bpage unless there is some chance we can - * use it. */ - if (atomic_read(&bpage->refs) || ((bpage->owner >= 0) - && (bpage->expiration > now))) + + /* Figure out whether this candidate is free (or can be + * stolen). Do a quick check without locking the page, and + * if the page looks promising, then lock it and check again + * (must check again in case someone else snuck in and + * grabbed the page). 
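
The atomic_sub_return()/atomic_add() pair at the top of homa_pool_get_pages() is a reservation pattern worth isolating: by claiming pages up front, the scan below can never run dry, so no rollback of partially allocated pages is ever needed. A user-space rendering with C11 atomics (names hypothetical):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int free_pages;	/* Mirrors pool->free_bpages. */

/* Claim num_pages before scanning; once this returns true, a scan of
 * the page array is guaranteed to find num_pages free pages.
 */
bool reserve_pages(int num_pages)
{
	if (atomic_fetch_sub(&free_pages, num_pages) < num_pages) {
		/* Pool too empty: undo the claim and report failure. */
		atomic_fetch_add(&free_pages, num_pages);
		return false;
	}
	return true;
}
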
+ */ + if (!homa_bpage_available(bpage, now)) continue; if (!spin_trylock_bh(&bpage->lock)) + /* Rather than wait for a locked page to become free, + * just go on to the next page. If the page is locked, + * it probably won't turn out to be available anyway. + */ continue; - - /* Must recheck after acquiring the lock (another core - * could have snuck in and grabbed the bpage). - */ - if (atomic_read(&bpage->refs) || ((bpage->owner >= 0) - && (bpage->expiration > now))) { + if (!homa_bpage_available(bpage, now)) { spin_unlock_bh(&bpage->lock); continue; } - atomic_inc(&pool->free_bpages_found); - atomic_set(&bpage->refs, 1); + if (bpage->owner >= 0) + atomic_inc(&pool->free_bpages); if (set_owner) { - bpage->owner = raw_smp_processor_id(); - bpage->expiration = now + pool->homa->bpage_lease_cycles; - } else + atomic_set(&bpage->refs, 2); + bpage->owner = core_num; + bpage->expiration = now + + pool->hsk->homa->bpage_lease_cycles; + } else { + atomic_set(&bpage->refs, 1); bpage->owner = -1; + } spin_unlock_bh(&bpage->lock); pages[alloced] = cur; alloced++; - if (alloced == num_pages) - return 0; - } - - /* If we get here, it means we ran out of space in the pool. Free - * any pages already allocated. There's no need to lock the bpage - * before modifying it; the ref count provides sufficient protection. - */ - for (i = 0; i < alloced; i++) { - struct homa_bpage *bpage = &pool->descriptors[pages[i]]; - bpage->owner = -1; - atomic_set(&bpage->refs, 0); } - return -1; + return 0; } /** - * homa_pool_allocate() - Allocate buffer space for an RPC. + * homa_pool_alloc_msg() - Allocate buffer space for an incoming message. * @rpc: RPC that needs space allocated for its incoming message (space must * not already have been allocated). The fields @msgin->num_buffers - * and @msgin->buffers are filled in. - * Return: 0 for success, -1 if space could not be allocated. + * and @msgin->buffers are filled in. Must be locked by caller. + * Return: The return value is normally 0, which means either buffer space + * was allocated or the @rpc was queued on @hsk->waiting. If a fatal error + * occurred, such as no buffer pool present, then a negative errno is + * returned. */ -int homa_pool_allocate(struct homa_rpc *rpc) +int homa_pool_alloc_msg(struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { - struct homa_pool *pool = &rpc->hsk->buffer_pool; + struct homa_pool *pool = rpc->hsk->buffer_pool; int full_pages, partial, i, core_id; - __u32 pages[HOMA_MAX_BPAGES]; struct homa_pool_core *core; + u32 pages[HOMA_MAX_BPAGES]; struct homa_bpage *bpage; - __u64 now = get_cycles(); + struct homa_rpc *other; if (!pool->region) - return -1; + return -ENOMEM; /* First allocate any full bpages that are needed. */ - full_pages = rpc->msgin.total_length >> HOMA_BPAGE_SHIFT; + full_pages = rpc->msgin.length >> HOMA_BPAGE_SHIFT; if (unlikely(full_pages)) { if (homa_pool_get_pages(pool, full_pages, pages, 0) != 0) - return -1; + goto out_of_space; for (i = 0; i < full_pages; i++) - rpc->msgin.bpage_offsets[i] = pages[i] << HOMA_BPAGE_SHIFT; + rpc->msgin.bpage_offsets[i] = pages[i] << + HOMA_BPAGE_SHIFT; } rpc->msgin.num_bpages = full_pages; /* The last chunk may be less than a full bpage; for this we use - * a bpage that we own (and reuse for multiple messages). + * the bpage that we own (and reuse it for multiple messages). 
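
Stripped of locking and page stealing, the owned-bpage fast path that follows is just a per-core bump allocator; schematically (a sketch, not the patch's code):

/* Bump-allocate a partial chunk from the core's hinted bpage; a
 * negative return corresponds to the new_page path below.
 */
static int sketch_alloc_partial(struct homa_pool_core *core, int partial)
{
	if (core->allocated + partial > HOMA_BPAGE_SIZE)
		return -1;	/* Page exhausted: caller grabs a new one. */
	core->allocated += partial;
	return core->allocated - partial;	/* Offset within the bpage. */
}
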
*/ - partial = rpc->msgin.total_length - (full_pages << HOMA_BPAGE_SHIFT); + partial = rpc->msgin.length & (HOMA_BPAGE_SIZE - 1); if (unlikely(partial == 0)) - return 0; - core_id = raw_smp_processor_id(); - core = &pool->cores[core_id]; + goto success; + core_id = smp_processor_id(); + core = this_cpu_ptr(pool->cores); bpage = &pool->descriptors[core->page_hint]; +#ifndef __STRIP__ /* See strip.py */ if (!spin_trylock_bh(&bpage->lock)) { - /* Someone else has the lock, which means they are stealing - * the bpage from us. Abandon it. - */ - goto new_page; + tt_record("beginning wait for bpage lock"); + spin_lock_bh(&bpage->lock); + tt_record("ending wait for bpage lock"); } +#else /* See strip.py */ + spin_lock_bh(&bpage->lock); +#endif /* See strip.py */ if (bpage->owner != core_id) { spin_unlock_bh(&bpage->lock); goto new_page; } if ((core->allocated + partial) > HOMA_BPAGE_SIZE) { - if (atomic_read(&bpage->refs) > 0) { +#ifndef __STRIP__ /* See strip.py */ + if (atomic_read(&bpage->refs) == 1) { + /* Bpage is totally free, so we can reuse it. */ + core->allocated = 0; + INC_METRIC(bpage_reuses, 1); +#else /* See strip.py */ + if (atomic_read(&bpage->refs) == 1) { + /* Bpage is totally free, so we can reuse it. */ + core->allocated = 0; +#endif /* See strip.py */ + } else { bpage->owner = -1; + + /* We know the reference count can't reach zero here + * because of check above, so we won't have to decrement + * pool->free_bpages. + */ + atomic_dec_return(&bpage->refs); spin_unlock_bh(&bpage->lock); goto new_page; } - /* Bpage is totally free, so we can reuse it. */ - core->allocated = 0; - INC_METRIC(bpage_reuses, 1); } - bpage->expiration = now + pool->homa->bpage_lease_cycles; + bpage->expiration = homa_clock() + + pool->hsk->homa->bpage_lease_cycles; atomic_inc(&bpage->refs); spin_unlock_bh(&bpage->lock); goto allocate_partial; /* Can't use the current page; get another one. */ - new_page: +new_page: if (homa_pool_get_pages(pool, 1, pages, 1) != 0) { homa_pool_release_buffers(pool, rpc->msgin.num_bpages, - rpc->msgin.bpage_offsets); + rpc->msgin.bpage_offsets); rpc->msgin.num_bpages = 0; - return -1; + goto out_of_space; } core->page_hint = pages[0]; core->allocated = 0; - allocate_partial: +allocate_partial: rpc->msgin.bpage_offsets[rpc->msgin.num_bpages] = core->allocated + (core->page_hint << HOMA_BPAGE_SHIFT); rpc->msgin.num_bpages++; core->allocated += partial; + +success: + tt_record4("Allocated %d bpage pointers on port %d for id %d, free_bpages now %d", + rpc->msgin.num_bpages, pool->hsk->port, rpc->id, + atomic_read(&pool->free_bpages)); + return 0; + + /* We get here if there wasn't enough buffer space for this + * message; add the RPC to hsk->waiting_for_bufs. The list is sorted + * by RPC length in order to implement SRPT. + */ +out_of_space: + INC_METRIC(buffer_alloc_failures, 1); + tt_record4("Buffer allocation failed, port %d, id %d, length %d, free_bpages %d", + pool->hsk->port, rpc->id, rpc->msgin.length, + atomic_read(&pool->free_bpages)); + homa_sock_lock(pool->hsk); + list_for_each_entry(other, &pool->hsk->waiting_for_bufs, buf_links) { + if (other->msgin.length > rpc->msgin.length) { + list_add_tail(&rpc->buf_links, &other->buf_links); + goto queued; + } + } + list_add_tail(&rpc->buf_links, &pool->hsk->waiting_for_bufs); + +queued: + set_bpages_needed(pool); + homa_sock_unlock(pool->hsk); return 0; } @@ -291,49 +412,161 @@ int homa_pool_allocate(struct homa_rpc *rpc) * homa_pool_get_buffer() - Given an RPC, figure out where to store incoming * message data. 
* @rpc: RPC for which incoming message data is being processed; its - * msgin must be properly initialized. + * msgin must be properly initialized and buffer space must have + * been allocated for the message. * @offset: Offset within @rpc's incoming message. * @available: Will be filled in with the number of bytes of space available - * at the returned address. + * at the returned address (could be zero if offset is + * (erroneously) past the end of the message). * Return: The application's virtual address for buffer space corresponding - * to @offset in the incoming message for @rpc. 0 is returned if - * buffer space could not be allocated. + * to @offset in the incoming message for @rpc. */ -void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available) +void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, + int *available) { int bpage_index, bpage_offset; - if (rpc->msgin.num_bpages == 0) - if (homa_pool_allocate(rpc) != 0) - return NULL; bpage_index = offset >> HOMA_BPAGE_SHIFT; - BUG_ON(bpage_index >= rpc->msgin.num_bpages); - bpage_offset = offset & (HOMA_BPAGE_SIZE-1); - *available = (bpage_index < (rpc->msgin.num_bpages-1)) + if (offset >= rpc->msgin.length) { + WARN_ONCE(true, "%s got offset %d >= message length %d\n", + __func__, offset, rpc->msgin.length); + *available = 0; + return NULL; + } + bpage_offset = offset & (HOMA_BPAGE_SIZE - 1); + *available = (bpage_index < (rpc->msgin.num_bpages - 1)) ? HOMA_BPAGE_SIZE - bpage_offset - : rpc->msgin.total_length - offset; - return rpc->hsk->buffer_pool.region + rpc->msgin.bpage_offsets[bpage_index] - + bpage_offset; + : rpc->msgin.length - offset; + return rpc->hsk->buffer_pool->region + + rpc->msgin.bpage_offsets[bpage_index] + bpage_offset; } /** * homa_pool_release_buffers() - Release buffer space so that it can be - * reused. This method may be invoked without holding any locks. - * @pool: Pool that the buffer space belongs to. + * reused. + * @pool: Pool that the buffer space belongs to. Doesn't need to + * be locked. * @num_buffers: How many buffers to release. * @buffers: Points to @num_buffers values, each of which is an offset * from the start of the pool to the buffer to be released. + * Return: 0 for success, otherwise a negative errno. */ -void homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, - __u32 *buffers) +int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, + u32 *buffers) { + int result = 0; int i; if (!pool->region) - return; + return result; for (i = 0; i < num_buffers; i++) { - __u32 bpage_index = buffers[i] >> HOMA_BPAGE_SHIFT; - if (bpage_index < pool->num_bpages) - atomic_dec(&pool->descriptors[bpage_index].refs); + u32 bpage_index = buffers[i] >> HOMA_BPAGE_SHIFT; + struct homa_bpage *bpage = &pool->descriptors[bpage_index]; + + if (bpage_index < pool->num_bpages) { + if (atomic_dec_return(&bpage->refs) == 0) + atomic_inc(&pool->free_bpages); + } else { + result = -EINVAL; + } + } + tt_record3("Released %d bpages, free_bpages for port %d now %d", + num_buffers, pool->hsk->port, + atomic_read(&pool->free_bpages)); + return result; +} + +/** + * homa_pool_check_waiting() - Checks to see if there are enough free + * bpages to wake up any RPCs that were blocked. Whenever + * homa_pool_release_buffers is invoked, this function must be invoked later, + * at a point when the caller holds no locks (homa_pool_release_buffers may + * be invoked with locks held, so it can't safely invoke this function). 
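
A worked instance of homa_pool_get_buffer()'s arithmetic above, assuming 64 KB bpages (HOMA_BPAGE_SHIFT == 16): offset 100000 within a 250000-byte message, which occupies 4 bpages (3 full plus 1 partial).

int bpage_index = 100000 >> 16;			/* == 1 (second bpage). */
int bpage_offset = 100000 & (65536 - 1);	/* == 34464. */
/* bpage_index < num_bpages - 1, so *available is the remainder of this
 * bpage: 65536 - 34464 = 31072 contiguous bytes at
 * region + bpage_offsets[1] + 34464.
 */
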
+ * This is regrettably tricky, but I can't think of a better solution. + * @pool: Information about the buffer pool. + */ +void homa_pool_check_waiting(struct homa_pool *pool) +{ +#ifdef __UNIT_TEST__ + pool->check_waiting_invoked += 1; +#endif /* __UNIT_TEST__ */ + if (!pool->region) + return; + while (atomic_read(&pool->free_bpages) >= pool->bpages_needed) { + struct homa_rpc *rpc; + + homa_sock_lock(pool->hsk); + if (list_empty(&pool->hsk->waiting_for_bufs)) { + pool->bpages_needed = INT_MAX; + homa_sock_unlock(pool->hsk); + break; + } + rpc = list_first_entry(&pool->hsk->waiting_for_bufs, + struct homa_rpc, buf_links); + if (!homa_rpc_try_lock(rpc)) { + /* Can't just spin on the RPC lock because we're + * holding the socket lock and the lock order is + * rpc-then-socket (see "Homa Locking Strategy" in + * homa_impl.h). Instead, release the socket lock + * and try the entire operation again. + */ + homa_sock_unlock(pool->hsk); + UNIT_LOG("; ", "rpc lock unavailable in %s", __func__); + continue; + } + list_del_init(&rpc->buf_links); + if (list_empty(&pool->hsk->waiting_for_bufs)) + pool->bpages_needed = INT_MAX; + else + set_bpages_needed(pool); + homa_sock_unlock(pool->hsk); + tt_record4("Retrying buffer allocation for id %d, length %d, free_bpages %d, new bpages_needed %d", + rpc->id, rpc->msgin.length, + atomic_read(&pool->free_bpages), + pool->bpages_needed); + homa_pool_alloc_msg(rpc); +#ifndef __STRIP__ /* See strip.py */ + if (rpc->msgin.num_bpages > 0) { + struct homa_resend_hdr resend; + + /* To "wake up" the RPC, request retransmission of + * all the packets that were dropped. Use the + * next-to-highest priority level to provide a priority + * boost without interfering with the highest priority + * traffic such as control packets. + */ + resend.offset = htonl(0); + resend.length = htonl(-1); + resend.priority = homa_high_priority(rpc->hsk->homa); + homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); + if (rpc->msgin.granted < rpc->msgin.length) + homa_grant_manage_rpc(rpc); + } +#endif /* See strip.py */ + homa_rpc_unlock(rpc); + } +} + +/** + * homa_pool_avail_bytes() - Return a count of the number of bytes currently + * unused and available for allocation in a pool. + * @pool: Pool of interest. + * Return: See above. + */ +u64 homa_pool_avail_bytes(struct homa_pool *pool) +{ + struct homa_pool_core *core; + u64 avail; + int cpu; + + if (!pool->region) + return 0; + avail = atomic_read(&pool->free_bpages); + avail *= HOMA_BPAGE_SIZE; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + core = per_cpu_ptr(pool->cores, cpu); + if (pool->descriptors[core->page_hint].owner == cpu) + avail += HOMA_BPAGE_SIZE - core->allocated; } -} \ No newline at end of file + return avail; +} diff --git a/homa_pool.h b/homa_pool.h new file mode 100644 index 00000000..1f545566 --- /dev/null +++ b/homa_pool.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ + +/* This file contains definitions used to manage user-space buffer pools. + */ + +#ifndef _HOMA_POOL_H +#define _HOMA_POOL_H + +#include + +#include "homa_rpc.h" + +/** + * struct homa_bpage - Contains information about a single page in + * a buffer pool. + */ +struct homa_bpage { + /** @lock: to synchronize shared access. */ + spinlock_t lock; + + /** + * @refs: Counts number of distinct uses of this + * bpage (1 tick for each message that is using + * this page, plus an additional tick if the @owner + * field is set). 
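
The trylock-and-retry step in homa_pool_check_waiting() above generalizes to any case where lock order is fixed (here rpc before socket) but the lower lock is already held. Schematically, with hypothetical locks:

/* Lock A must be taken before lock B, but the caller arrives holding
 * only B: trylock A and, on failure, drop B and restart rather than
 * spinning (spinning could deadlock against a thread that holds A and
 * wants B).
 */
void lock_both_in_order(spinlock_t *a, spinlock_t *b)
{
	for (;;) {
		spin_lock_bh(b);
		if (spin_trylock_bh(a))
			break;
		spin_unlock_bh(b);	/* Back off and try again. */
	}
	/* ... both locks held here; do the work ... */
	spin_unlock_bh(a);
	spin_unlock_bh(b);
}
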
+ */ + atomic_t refs; + + /** + * @owner: kernel core that currently owns this page + * (< 0 if none). + */ + int owner; + + /** + * @expiration: homa_clock() time after which it's OK to steal this + * page from its current owner (if @refs is 1). + */ + u64 expiration; +} ____cacheline_aligned_in_smp; + +/** + * struct homa_pool_core - Holds core-specific data for a homa_pool (a bpage + * out of which that core is allocating small chunks). + */ +struct homa_pool_core { + /** + * @page_hint: Index of bpage in pool->descriptors, + * which may be owned by this core. If so, we'll use it + * for allocating partial pages. + */ + int page_hint; + + /** + * @allocated: if the page given by @page_hint is + * owned by this core, this variable gives the number of + * (initial) bytes that have already been allocated + * from the page. + */ + int allocated; + + /** + * @next_candidate: when searching for free bpages, + * check this index next. + */ + int next_candidate; +}; + +/** + * struct homa_pool - Describes a pool of buffer space for incoming + * messages for a particular socket; managed by homa_pool.c. The pool is + * divided up into "bpages", which are a multiple of the hardware page size. + * A bpage may be owned by a particular core so that it can more efficiently + * allocate space for small messages. + */ +struct homa_pool { + /** + * @hsk: the socket that this pool belongs to. + */ + struct homa_sock *hsk; + + /** + * @region: beginning of the pool's region (in the app's virtual + * memory). Divided into bpages. 0 means the pool hasn't yet been + * initialized. + */ + char __user *region; + + /** @num_bpages: total number of bpages in the pool. */ + int num_bpages; + + /** @descriptors: kmalloced area containing one entry for each bpage. */ + struct homa_bpage *descriptors; + + /** + * @free_bpages: the number of pages still available for allocation + * by homa_pool_get pages. This equals the number of pages with zero + * reference counts, minus the number of pages that have been claimed + * by homa_get_pool_pages but not yet allocated. + */ + atomic_t free_bpages; + + /** + * @bpages_needed: the number of free bpages required to satisfy the + * needs of the first RPC on @hsk->waiting_for_bufs, or INT_MAX if + * that queue is empty. + */ + int bpages_needed; + + /** @cores: core-specific info; dynamically allocated. */ + struct homa_pool_core __percpu *cores; + + /** + * @check_waiting_invoked: incremented during unit tests when + * homa_pool_check_waiting is invoked. 
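
struct homa_pool_core above follows the standard per-CPU idiom: one small private structure per core so the hot path never takes a lock. A minimal sketch of that idiom (names hypothetical):

#include <linux/errno.h>
#include <linux/percpu.h>

/* Hypothetical miniature of the homa_pool_core arrangement. */
struct demo_core {
	int page_hint;
	int allocated;
};

static struct demo_core __percpu *demo_cores;

int demo_init(void)
{
	demo_cores = alloc_percpu(struct demo_core);
	return demo_cores ? 0 : -ENOMEM;
}

void demo_fast_path(int bytes)
{
	/* No lock needed: each CPU touches only its own copy (preemption
	 * is already disabled in softirq context, as in the pool code).
	 */
	struct demo_core *core = this_cpu_ptr(demo_cores);

	core->allocated += bytes;
}

void demo_exit(void)
{
	free_percpu(demo_cores);
}
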
+ */ + int check_waiting_invoked; +}; + +bool homa_bpage_available(struct homa_bpage *bpage, u64 now); +struct homa_pool *homa_pool_alloc(struct homa_sock *hsk); +int homa_pool_alloc_msg(struct homa_rpc *rpc); +u64 homa_pool_avail_bytes(struct homa_pool *pool); +void homa_pool_check_waiting(struct homa_pool *pool); +void homa_pool_free(struct homa_pool *pool); +void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, + int *available); +int homa_pool_get_pages(struct homa_pool *pool, int num_pages, + u32 *pages, int leave_locked); +void homa_pool_get_rcvbuf(struct homa_pool *pool, + struct homa_rcvbuf_args *args); +int homa_pool_release_buffers(struct homa_pool *pool, + int num_buffers, u32 *buffers); +int homa_pool_set_region(struct homa_sock *hsk, void __user *region, + u64 region_size); + +#endif /* _HOMA_POOL_H */ diff --git a/homa_qdisc.c b/homa_qdisc.c new file mode 100755 index 00000000..9fa07e64 --- /dev/null +++ b/homa_qdisc.c @@ -0,0 +1,1281 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +/* This file implements a special-purpose queuing discipline for Homa. + * This queuing discipline serves the following purposes: + * - It paces output traffic so that queues do not build up in the NIC + * (they build up here instead). + * - It implements the SRPT policy for Homa traffic (highest priority goes + * to the message with the fewest bytes remaining to transmit). + * - It manages TCP traffic as well as Homa traffic, so that TCP doesn't + * create long NIC queues. + * - When queues do build up, it balances output traffic between Homa and TCP. + */ + +/* PACING: + * + * Preventing congestion in the NIC is essential for a proper implementation + * of SRPT (otherwise a short message could get stuck behind a long message + * in the NIC). This file implements a two-part strategy: + * + * First, it paces output traffic so that packets are passed to the NIC at + * a data rate no more than the uplink bandwidth. It implements this by + * keeping a variable qdev->link_idle_time, which is an estimate of when + * the NIC will have finished transmitting all data that has been passed to + * it (assuming transmission at full link speed). If this time gets too far + * into the future (determined by the max_nic_est_backlog_usecs sysctl + * variable) then Homa stops handing off packets to the NIC until link_idle_time + * is no longer too far in the future. + * + * Unfortunately, this technique is not adequate by itself because NICs + * cannot always transmit at full link bandwidth; for example, measurements + * of Intel NICs in December 2025 showed NIC output as low as 80% of link + * bandwidth even with a large backlog of (mixed-size) output packets. As a + * result, with this approach alone NIC queues frequently build up + * (measurements showed total NIC backlogs of 5 MB or more under high + * network load, even with DQL). If the pacing rate is reduced to a level + * where the NIC could always keep up, it would sacrifice link bandwidth in + * situations where the NIC can transmit at closer to line rate. + * + * Thus Homa also uses a second approach, which is based on information + * maintained by the dynamic queue limits mechanism (DQL). DQL keeps + * counters for each netdev_queue that indicate how many bytes are in the + * NIC's possession for each queue (i.e. packets that have been passed + * to the NIC but not yet returned after transmission). 
If the number of
+ * outstanding bytes for any queue exceeds a limit (determined by the
+ * max_nic_queue_usecs sysctl parameter) then the NIC is considered
+ * congested and Homa will stop queuing more packets until the congestion
+ * subsides. This reduces worst-case total NIC queuing by 2-3x (as of
+ * January 2026).
+ *
+ * It might seem that the second approach is sufficient by itself, so the
+ * first approach is not needed. Unfortunately, updates to the DQL counters
+ * don't happen until packets are actually transmitted. This means that
+ * a large burst of packets could pass through the qdisc mechanism before
+ * the DQL counters are updated, resulting in significant queue buildup.
+ * The first technique prevents this from happening.
+ *
+ * There is one additional twist, which is that the rate limits above do
+ * not apply to small packets. The reasons for this are explained in a comment
+ * in homa_qdisc_enqueue.
+ *
+ * In case you're wondering "why don't you just use DQL?", the DQL mechanism
+ * is inadequate in two ways. First, it allows large queues to accumulate in
+ * the NIC. Second, when queues build up, Homa wants to know so it can
+ * throttle long messages more than short ones. DQL provides no feedback
+ * to qdiscs; it simply stops the entire output queue, throttling short and
+ * long messages alike. This interferes with Homa's SRPT scheduler.
+ */
+
+#include "homa_impl.h"
+#include "homa_qdisc.h"
+#include "homa_rpc.h"
+#include "timetrace.h"
+
+#include 
+
+/* Used to enable sysctl access to configuration parameters related to
+ * homa_qdisc. The @data fields are actually offsets within a struct
+ * homa_qdisc_shared; these are converted to pointers into the net-specific
+ * struct homa_qdisc_shared later.
+ */
+#define OFFSET(field) ((void *)offsetof(struct homa_qdisc_shared, field))
+static struct ctl_table homa_qdisc_ctl_table[] = {
+ {
+ .procname = "max_nic_est_backlog_usecs",
+ .data = OFFSET(max_nic_est_backlog_usecs),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = homa_qdisc_dointvec
+ },
+ {
+ .procname = "max_nic_queue_usecs",
+ .data = OFFSET(max_nic_queue_usecs),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = homa_qdisc_dointvec
+ },
+ {
+ .procname = "pacer_fifo_fraction",
+ .data = OFFSET(fifo_fraction),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = homa_qdisc_dointvec
+ },
+ {
+ .procname = "defer_min_bytes",
+ .data = OFFSET(defer_min_bytes),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = homa_qdisc_dointvec
+ },
+ {
+ .procname = "homa_share",
+ .data = OFFSET(homa_share),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = homa_qdisc_dointvec
+ },
+ {
+ .procname = "max_link_usage",
+ .data = OFFSET(max_link_usage),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = homa_qdisc_dointvec
+ },
+};
+
+static struct Qdisc_ops homa_qdisc_ops __read_mostly = {
+ .id = "homa",
+ .priv_size = sizeof(struct homa_qdisc),
+ .enqueue = homa_qdisc_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .peek = qdisc_peek_head,
+ .init = homa_qdisc_init,
+ .reset = qdisc_reset_queue,
+ .destroy = homa_qdisc_destroy,
+ .owner = THIS_MODULE,
+};
+
+/**
+ * homa_qdisc_register() - Invoked when the Homa module is loaded; makes
+ * the homa qdisc known to Linux.
+ * Return: 0 for success or a negative errno if an error occurred.
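
A hedged reduction of the link_idle_time rule from the PACING comment above; the real logic lives in homa_qdisc_update_link_idle(), which is not part of this hunk, so the names and structure below are assumptions. Note how -1 can serve as a "no limit" bound, matching the calls in homa_qdisc_enqueue().

/* Returns true if the packet may be passed to the NIC now, false if it
 * must be deferred. cycles_per_byte would be derived from the link speed.
 */
static bool sketch_update_link_idle(atomic64_t *link_idle_time, int bytes,
				    u64 cycles_per_byte, u64 max_backlog)
{
	u64 now, old, new_idle;

	do {
		now = homa_clock();
		old = atomic64_read(link_idle_time);
		/* Refuse if the NIC already has too much committed work. */
		if (max_backlog != (u64)-1 && old > now + max_backlog)
			return false;
		/* Extend the estimate by this packet's time on the wire. */
		new_idle = (old < now ? now : old) + bytes * cycles_per_byte;
	} while (atomic64_cmpxchg(link_idle_time, old, new_idle) != old);
	return true;
}
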
+ */ +int homa_qdisc_register(void) +{ + return register_qdisc(&homa_qdisc_ops); +} + +/** + * homa_qdisc_unregister() - Invoked when the Homa module is about to be + * unloaded: deletes all information related to the homa qdisc. + */ +void homa_qdisc_unregister(void) +{ + unregister_qdisc(&homa_qdisc_ops); +} + +/** + * homa_rcu_kfree() - Call kfree on a block of memory when it is safe to + * do so from an RCU standpoint. If possible, the freeing is done + * asynchronously. + * @object: Eventually invoke kfree on this. + */ +void homa_rcu_kfree(void *object) +{ + struct homa_rcu_kfreer *freer; + + freer = kmalloc(sizeof(*freer), GFP_KERNEL); + if (!freer) { + /* Can't allocate memory needed for asynchronous freeing, + * so free synchronously. + */ + UNIT_LOG("; ", "homa_rcu_kfree kmalloc failed"); + synchronize_rcu(); + kfree(object); + } else { + freer->object = object; + call_rcu(&freer->rcu_head, homa_rcu_kfree_callback); + } +} + +/** + * homa_rcu_kfree_callback() - This function is invoked by the RCU subsystem + * when it safe to free an object previously passed to homa_rcu_kfree. + * @head: Points to the rcu_head member of a struct homa_rcu_kfreer. + */ +void homa_rcu_kfree_callback(struct rcu_head *head) +{ + struct homa_rcu_kfreer *freer; + + freer = container_of(head, struct homa_rcu_kfreer, rcu_head); + kfree(freer->object); + kfree(freer); +} + +/** + * homa_qdisc_shared_alloc() - Allocate and initialize a new homa_qdisc_shared + * object. + * Return: The new object, or an ERR_PTR if an error occurred. + */ +struct homa_qdisc_shared *homa_qdisc_shared_alloc(void) +{ + struct homa_qdisc_shared *qshared; + + qshared = kzalloc(sizeof(*qshared), GFP_KERNEL); + if (!qshared) + return ERR_PTR(-ENOMEM); + + mutex_init(&qshared->mutex); + INIT_LIST_HEAD(&qshared->qdevs); + qshared->fifo_fraction = 50; + qshared->max_nic_est_backlog_usecs = 5; + qshared->max_nic_queue_usecs = 20; + qshared->defer_min_bytes = 1000; + qshared->homa_share = 50; + qshared->max_link_usage = 99; + qshared->sysctl_header = register_net_sysctl(&init_net, "net/homa", + homa_qdisc_ctl_table); + if (!qshared->sysctl_header) { + pr_err("couldn't register sysctl parameters for Homa qdisc\n"); + kfree(qshared); + return ERR_PTR(-ENOMEM); + } + homa_qdisc_update_sysctl_deps(qshared); + return qshared; +} + +/** + * homa_qdisc_shared_free() - Invoked when a struct homa is being freed; + * releases information related to all the associated homa_qdiscs. + * @qshared: Information about homa_qdisc_devs associated with a + * particular struct homa. + */ +void homa_qdisc_shared_free(struct homa_qdisc_shared *qshared) +{ + struct homa_qdisc_dev *qdev; + int stranded = 0; + + /* At this point no-one else besides us should ever access this object + * again, but lock it just to be safe. + */ + mutex_lock(&qshared->mutex); + while (1) { + qdev = list_first_or_null_rcu(&qshared->qdevs, + struct homa_qdisc_dev, links); + if (!qdev) + break; + + /* This code should never execute (all the qdevs should + * already have been deleted). We can't safely free the + * stranded qdevs, but at least stop their pacer threads to + * reduce the likelihood of dereferencing dangling pointers. 
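
Typical use of homa_rcu_kfree() above (struct demo_node and its list are hypothetical): unlink an object that RCU readers may still be traversing, then free it without blocking the caller for a grace period.

#include <linux/rculist.h>

struct demo_node {
	struct list_head links;
};

void demo_remove(struct demo_node *node)
{
	list_del_rcu(&node->links);	/* Readers may still hold pointers. */
	homa_rcu_kfree(node);		/* kfree runs after a grace period. */
}
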
+ */ + stranded++; + list_del_rcu(&qdev->links); + INIT_LIST_HEAD(&qdev->links); + kthread_stop(qdev->pacer_kthread); + qdev->pacer_kthread = NULL; + } + if (stranded != 0) + pr_err("homa_qdisc_devs_free found %d live qdevs (should have been none)\n", + stranded); + + if (qshared->sysctl_header) { + unregister_net_sysctl_table(qshared->sysctl_header); + qshared->sysctl_header = NULL; + } + mutex_unlock(&qshared->mutex); + homa_rcu_kfree(qshared); +} + +/** + * homa_qdisc_qdev_get() - Find the homa_qdisc_dev to use for a particular + * net_device and increment its reference count. Create a new one if there + * isn't an existing one to use. Do this in an RCU-safe fashion. + * @dev: NIC that the homa_qdisc_dev will manage. + * Return: A pointer to the new homa_qdisc_dev, or a PTR_ERR errno. + */ +struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) +{ + struct homa_qdisc_shared *qshared; + struct homa_qdisc_dev *qdev; + struct homa_net *hnet; + + rcu_read_lock(); + hnet = homa_net(dev_net(dev)); + qshared = hnet->homa->qshared; + list_for_each_entry_rcu(qdev, &qshared->qdevs, links) { + if (qdev->dev == dev && refcount_inc_not_zero(&qdev->refs)) { + rcu_read_unlock(); + return qdev; + } + } + rcu_read_unlock(); + + /* Must allocate a new homa_qdisc_dev (but must check again, + * after acquiring the mutex, in case someone else already + * created it). + */ + mutex_lock(&qshared->mutex); + list_for_each_entry_rcu(qdev, &qshared->qdevs, links) { + if (qdev->dev == dev && refcount_inc_not_zero(&qdev->refs)) { + UNIT_LOG("; ", "race in homa_qdisc_qdev_get"); + goto done; + } + } + + qdev = kzalloc(sizeof(*qdev), GFP_KERNEL); + if (!qdev) { + qdev = ERR_PTR(-ENOMEM); + goto done; + } + qdev->dev = dev; + qdev->hnet = hnet; + refcount_set(&qdev->refs, 1); + homa_qdev_update_sysctl(qdev); + INIT_LIST_HEAD(&qdev->links); + qdev->deferred_rpcs = RB_ROOT_CACHED; + INIT_LIST_HEAD(&qdev->deferred_qdiscs); + qdev->next_qdisc = &qdev->deferred_qdiscs; + spin_lock_init(&qdev->defer_lock); + init_waitqueue_head(&qdev->pacer_sleep); + spin_lock_init(&qdev->pacer_mutex); + + qdev->pacer_kthread = kthread_run(homa_qdisc_pacer_main, qdev, + "homa_qdisc_pacer"); + if (IS_ERR(qdev->pacer_kthread)) { + int error = PTR_ERR(qdev->pacer_kthread); + + pr_err("couldn't create homa qdisc pacer thread: error %d\n", + error); + kfree(qdev); + qdev = ERR_PTR(error); + goto done; + } + list_add_rcu(&qdev->links, &qshared->qdevs); + +done: + mutex_unlock(&qshared->mutex); + return qdev; +} + +/** + * homa_qdisc_qdev_put() - Decrement the reference count for a homa_qdisc_qdev + * and free it if the count becomes zero. + * @qdev: Object to unreference. + */ +void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) +{ + struct homa_qdisc_shared *qshared; + + if (!refcount_dec_and_test(&qdev->refs)) + return; + + /* Make this homa_qdisc_dev inaccessible, then schedule an RCU-safe + * free. Think carefully before you modify this code, to ensure that + * concurrent RCU scans of qshared->qdevs are safe. + */ + qshared = qdev->hnet->homa->qshared; + mutex_lock(&qshared->mutex); + list_del_rcu(&qdev->links); + kthread_stop(qdev->pacer_kthread); + qdev->pacer_kthread = NULL; + call_rcu(&qdev->rcu_head, homa_qdisc_dev_callback); + mutex_unlock(&qshared->mutex); +} + +/** + * homa_qdisc_dev_callback() - Invoked by the RCU subsystem when it is + * safe to finish deleting a homa_qdisc_dev. + * @head: Pointer to the rcu_head field in a homa_qdisc_qdev. 
+ */ +void homa_qdisc_dev_callback(struct rcu_head *head) +{ + struct homa_qdisc_dev *qdev; + + qdev = container_of(head, struct homa_qdisc_dev, rcu_head); + homa_qdisc_free_homa(qdev); + WARN_ON(!list_empty(&qdev->deferred_qdiscs)); + kfree(qdev); +} + +/** + * homa_qdisc_init() - Initialize a new instance of this queuing discipline. + * @sch: Qdisc to initialize. + * @opt: Options for this qdisc; not currently used. + * @extack: For reporting detailed information relating to errors; not used. + * Return: 0 for success, otherwise a negative errno. + */ +int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct homa_qdisc *q = qdisc_priv(sch); + struct homa_qdisc_dev *qdev; + int i; + + qdev = homa_qdisc_qdev_get(sch->dev_queue->dev); + if (IS_ERR(qdev)) + return PTR_ERR(qdev); + + q->qdisc = sch; + q->qdev = qdev; + q->ix = -1; + for (i = 0; i < qdev->dev->num_tx_queues; i++) { + if (netdev_get_tx_queue(qdev->dev, i) == sch->dev_queue) { + q->ix = i; + break; + } + } + skb_queue_head_init(&q->deferred_tcp); + INIT_LIST_HEAD(&q->defer_links); + + sch->limit = 10 * 1024; + return 0; +} + +/** + * homa_qdisc_destroy() - This function is invoked to perform final cleanup + * before a qdisc is deleted. + * @qdisc: Qdisc that is being deleted. + */ +void homa_qdisc_destroy(struct Qdisc *qdisc) +{ + struct homa_qdisc *q = qdisc_priv(qdisc); + + qdisc_reset_queue(qdisc); + + spin_lock_bh(&q->qdev->defer_lock); + while (!skb_queue_empty(&q->deferred_tcp)) + kfree_skb_reason(__skb_dequeue(&q->deferred_tcp), + SKB_DROP_REASON_QDISC_DROP); + list_del_init(&q->defer_links); + if (q->qdev->congested_qdisc == q) + q->qdev->congested_qdisc = NULL; + spin_unlock_bh(&q->qdev->defer_lock); + homa_qdisc_qdev_put(q->qdev); +} + +/** + * homa_qdisc_enqueue() - Invoked when a new packet becomes available for + * transmission; this function determines whether to send it immediately + * or defer it until the NIC queue subsides. + * @skb: Packet to eventually transmit. + * @sch: Qdisc via which to transmit @skb. + * @to_free: Used when dropping packets. + */ +int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct homa_qdisc *q = qdisc_priv(sch); + struct homa_qdisc_dev *qdev = q->qdev; + struct homa_qdisc_shared *qshared; + struct homa_data_hdr *h; + int pkt_len; + int offset; + + homa_qdisc_update_congested(q); + + /* This function tries to transmit short packets immediately for both + * Homa and TCP, even when the NIC queue is long. We do this because + * (a) it reduces tail latency significantly for short packets, + * (b) there is no way to generate enough short packets to cause NIC + * queue buildup, and (c) the pacer's single thread doesn't have + * enough throughput to handle all the short packets at high load + * (whereas processing here happens concurrently on multiple cores). + */ + qshared = qdev->hnet->homa->qshared; + pkt_len = qdisc_pkt_len(skb); + if (!is_homa_pkt(skb)) { + /* This is a TCP packet (or something else other than Homa). + * Defer short TCP packets only if they are in the same flow + * as a previously deferred packet for this qdisc. 
+		 */
+		INC_METRIC(qdisc_tcp_packets, 1);
+		if (pkt_len < qshared->defer_min_bytes) {
+			if (skb_queue_empty(&q->deferred_tcp) ||
+			    homa_qdisc_can_bypass(skb, q)) {
+				homa_qdisc_update_link_idle(qdev, pkt_len, -1);
+				goto enqueue;
+			}
+			homa_qdisc_defer_tcp(q, skb);
+			return NET_XMIT_SUCCESS;
+		}
+		if (!READ_ONCE(qdev->congested_qdisc) &&
+		    !homa_qdisc_any_deferred(qdev) &&
+		    homa_qdisc_update_link_idle(qdev, pkt_len,
+				qshared->max_nic_est_backlog_cycles))
+			goto enqueue;
+		homa_qdisc_defer_tcp(q, skb);
+		return NET_XMIT_SUCCESS;
+	}
+
+	/* For Homa packets it's important to use message length, not packet
+	 * length, when deciding whether to bypass the pacer. If packet
+	 * length were used, then the short packet at the end of a long
+	 * message might be transmitted when all the earlier packets in the
+	 * message have been deferred, and the deferred packets might not be
+	 * transmitted for a long time due to SRPT. In the meantime, the
+	 * receiver will have reserved "incoming" bytes for those packets.
+	 * These reservations can pile up to the point where the receiver
+	 * can't issue any grants, even though the "incoming" data isn't
+	 * going to be transmitted anytime soon.
+	 */
+	h = (struct homa_data_hdr *)skb_transport_header(skb);
+	offset = homa_get_offset(h);
+	if (h->common.type != DATA || ntohl(h->message_length) <
+	    qshared->defer_min_bytes) {
+		homa_qdisc_update_link_idle(qdev, pkt_len, -1);
+		goto enqueue;
+	}
+
+	if (!READ_ONCE(qdev->congested_qdisc) &&
+	    !homa_qdisc_any_deferred(qdev) &&
+	    homa_qdisc_update_link_idle(qdev, pkt_len,
+			qshared->max_nic_est_backlog_cycles))
+		goto enqueue;
+
+	/* This packet needs to be deferred until the NIC queue has
+	 * been drained a bit.
+	 */
+	tt_record3("homa_qdisc_enqueue deferring homa data packet for id %d, offset %d on qid %d",
+		   be64_to_cpu(h->common.sender_id), offset, q->ix);
+	homa_qdisc_defer_homa(qdev, skb);
+	return NET_XMIT_SUCCESS;
+
+enqueue:
+	if (is_homa_pkt(skb)) {
+		if (h->common.type == DATA) {
+			h = (struct homa_data_hdr *)skb_transport_header(skb);
+			tt_record3("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d on qid %d",
+				   be64_to_cpu(h->common.sender_id), offset,
+				   q->ix);
+		}
+	} else {
+		tt_record1("homa_qdisc_enqueue queuing non-homa packet, qid %d",
+			   q->ix);
+	}
+	if (unlikely(sch->q.qlen >= READ_ONCE(sch->limit)))
+		return qdisc_drop(skb, sch, to_free);
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+/**
+ * homa_qdisc_can_bypass() - Determine whether it is OK to transmit a given
+ * TCP packet before those already deferred for a qdisc.
+ * @skb:    Packet that is a candidate for bypassing the deferred packets.
+ * @q:      Qdisc with deferred TCP packets.
+ * Return:  True if @skb can be transmitted before the packets deferred
+ *          for @q without violating reordering rules.
+ */
+bool homa_qdisc_can_bypass(struct sk_buff *skb, struct homa_qdisc *q)
+{
+	struct sk_buff *skb2;
+	__be32 daddr, daddr2;
+	__be16 source, dest;
+	bool result;
+	int element;
+
+	/* Collect information from skb. If it isn't a TCP packet then
+	 * reordering constraints are unknown so deny reordering.
+	 */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		if (ip_hdr(skb)->protocol != IPPROTO_TCP)
+			return false;
+		daddr = ip_hdr(skb)->daddr;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (ipv6_hdr(skb)->nexthdr != IPPROTO_TCP)
+			return false;
+		daddr = ipv6_hdr(skb)->daddr.in6_u.u6_addr32[0] ^
+			ipv6_hdr(skb)->daddr.in6_u.u6_addr32[1] ^
+			ipv6_hdr(skb)->daddr.in6_u.u6_addr32[2] ^
+			ipv6_hdr(skb)->daddr.in6_u.u6_addr32[3];
+	} else {
+		return false;
+	}
+
+	/* If skb is an ack (i.e.
no payload) then reordering is fine. */ + if ((skb->len - skb_transport_offset(skb) - tcp_hdrlen(skb)) == 0) + return true; + + /* If any packets in the list are TCP packets on the same flow + * then deny reordering. The flow check is overconservative, in that + * it may sometimes deny even when the flows aren't the same. + */ + source = tcp_hdr(skb)->source; + dest = tcp_hdr(skb)->dest; + element = 0; + result = true; + spin_lock_bh(&q->qdev->defer_lock); + skb_queue_walk(&q->deferred_tcp, skb2) { + element++; + if (skb2->protocol == htons(ETH_P_IP)) { + if (ip_hdr(skb2)->protocol != IPPROTO_TCP) + continue; + daddr2 = ip_hdr(skb2)->daddr; + } else if (skb2->protocol == htons(ETH_P_IPV6)) { + if (ipv6_hdr(skb2)->nexthdr != IPPROTO_TCP) + continue; + daddr2 = ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[0] ^ + ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[1] ^ + ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[2] ^ + ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[3]; + + } else { + continue; + } + + if (daddr == daddr2 && dest == tcp_hdr(skb2)->dest && + source == tcp_hdr(skb2)->source) { + result = false; + break; + } + } + spin_unlock_bh(&q->qdev->defer_lock); + return result; +} + +/** + * homa_qdisc_defer_tcp() - Add a non-Homa packet to the deferred list for + * a qdisc. + * @q: Qdisc where the packet was submitted. + * @skb: Packet to defer (must not be a Homa packet). + */ +void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb) +{ + struct homa_qdisc_dev *qdev = q->qdev; + u64 now = homa_clock(); + + tt_record_tcp("homa_qdisc deferring TCP packet from " + "0x%x to 0x%x, data bytes %d, seq/ack %u", + skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + + spin_lock_bh(&qdev->defer_lock); + __skb_queue_tail(&q->deferred_tcp, skb); + if (list_empty(&q->defer_links)) + list_add_tail(&q->defer_links, &qdev->deferred_qdiscs); + if (qdev->last_defer) + INC_METRIC(nic_backlog_cycles, now - qdev->last_defer); + else + wake_up(&qdev->pacer_sleep); + qdev->last_defer = now; + spin_unlock_bh(&qdev->defer_lock); +} + +/** + * homa_qdisc_defer_homa() - Add a Homa packet to the deferred list for + * a qdev. + * @qdev: Network device for which the packet should be enqueued. + * @skb: Packet to enqueue. + */ +void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb) +{ + struct homa_skb_info *info = homa_get_skb_info(skb); + struct homa_rpc *rpc = info->rpc; + u64 now = homa_clock(); + + spin_lock_bh(&qdev->defer_lock); + __skb_queue_tail(&rpc->qrpc.packets, skb); + if (skb_queue_len(&rpc->qrpc.packets) == 1) { + int bytes_left; + + bytes_left = rpc->msgout.length - info->offset; + if (bytes_left < rpc->qrpc.tx_left) + rpc->qrpc.tx_left = bytes_left; + homa_qdisc_insert_rb(qdev, rpc); + } + if (qdev->last_defer) + INC_METRIC(nic_backlog_cycles, now - qdev->last_defer); + else + wake_up(&qdev->pacer_sleep); + qdev->last_defer = now; + spin_unlock_bh(&qdev->defer_lock); +} + +/** + * homa_qdisc_insert_rb() - Insert an RPC into the deferred_rpcs red-black + * tree. + * @qdev: Network device for the RPC. + * @rpc: RPC to insert. 
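+ * Caller must hold @qdev->defer_lock (both call sites in this file do).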
+ */
+void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc)
+{
+	struct rb_node **new = &qdev->deferred_rpcs.rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct homa_rpc *rpc2;
+	bool leftmost = true;
+
+	while (*new) {
+		parent = *new;
+		rpc2 = container_of(*new, struct homa_rpc, qrpc.rb_node);
+		if (homa_qdisc_precedes(rpc, rpc2)) {
+			new = &((*new)->rb_left);
+		} else {
+			new = &((*new)->rb_right);
+			leftmost = false;
+		}
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&rpc->qrpc.rb_node, parent, new);
+	rb_insert_color_cached(&rpc->qrpc.rb_node, &qdev->deferred_rpcs,
+			       leftmost);
+
+	if (qdev->oldest_rpc && rpc->msgout.init_time <
+	    qdev->oldest_rpc->msgout.init_time)
+		qdev->oldest_rpc = rpc;
+}
+
+/**
+ * homa_qdisc_xmit_deferred_tcp() - Transmit the "next" non-Homa packet
+ * that has been deferred for a particular homa_qdisc_dev.
+ * @qdev:   Device on which to transmit packet.
+ * Return:  The number of bytes in the transmitted packet, or 0 if there
+ *          were no deferred TCP packets.
+ */
+int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev)
+{
+	struct homa_qdisc *q;
+	struct sk_buff *skb;
+	int pkt_len;
+
+	/* When there are deferred TCP packets on multiple queues, we
+	 * will cycle between the queues in round-robin style, transmitting
+	 * one packet from each queue. An earlier implementation kept all
+	 * of the deferred TCP packets on a single global queue for the qdev
+	 * and transmitted them in FIFO fashion. However, this resulted in
+	 * head-of-line blocking where a short message for one queue could
+	 * get stuck behind a long message for a different queue, resulting
+	 * in high tail latency. With the round-robin approach, shorter
+	 * messages get transmitted more quickly as long as they don't use
+	 * the same NIC queue as a long message.
+	 */
+
+	spin_lock_bh(&qdev->defer_lock);
+	if (list_empty(&qdev->deferred_qdiscs)) {
+		spin_unlock_bh(&qdev->defer_lock);
+		return 0;
+	}
+	if (qdev->next_qdisc == &qdev->deferred_qdiscs)
+		q = list_first_entry(&qdev->deferred_qdiscs, struct homa_qdisc,
+				     defer_links);
+	else
+		q = list_entry(qdev->next_qdisc, struct homa_qdisc,
+			       defer_links);
+	qdev->next_qdisc = q->defer_links.next;
+	skb = __skb_dequeue(&q->deferred_tcp);
+	if (skb_queue_empty(&q->deferred_tcp)) {
+		list_del_init(&q->defer_links);
+		if (!homa_qdisc_any_deferred(qdev)) {
+			INC_METRIC(nic_backlog_cycles,
+				   homa_clock() - qdev->last_defer);
+			qdev->last_defer = 0;
+		}
+	}
+	spin_unlock_bh(&qdev->defer_lock);
+
+	pkt_len = qdisc_pkt_len(skb);
+	homa_qdisc_update_link_idle(qdev, pkt_len, -1);
+	if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+		tt_record_tcp("homa_qdisc_pacer requeued TCP packet from "
+			      "0x%x to 0x%x, data bytes %d, seq/ack %u",
+			      skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+	homa_qdisc_schedule_skb(skb, qdisc_from_priv(q));
+	homa_qdisc_update_congested(q);
+	return pkt_len;
+}
+
+/**
+ * homa_qdisc_get_oldest() - Find and return the oldest Homa RPC with deferred
+ * packets for a qdev.
+ * @qdev:   Info about deferred RPCs is stored here.
+ * Return:  See above. NULL is returned if there are no deferred RPCs in qdev.
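+ * Note: the caller must hold @qdev->defer_lock.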
+ */
+struct homa_rpc *homa_qdisc_get_oldest(struct homa_qdisc_dev *qdev)
+{
+	struct rb_node *node;
+	struct homa_rpc *rpc;
+	u64 oldest_time;
+
+	if (qdev->oldest_rpc)
+		return qdev->oldest_rpc;
+	qdev->oldest_rpc = NULL;
+	oldest_time = ~0;
+
+	for (node = rb_first_cached(&qdev->deferred_rpcs); node;
+	     node = rb_next(node)) {
+		rpc = container_of(node, struct homa_rpc, qrpc.rb_node);
+		if (rpc->msgout.init_time < oldest_time) {
+			oldest_time = rpc->msgout.init_time;
+			qdev->oldest_rpc = rpc;
+		}
+	}
+	return qdev->oldest_rpc;
+}
+
+/**
+ * homa_qdisc_get_deferred_homa() - Return the highest-priority deferred Homa
+ * packet and dequeue it from the structures that manage deferred packets.
+ * @qdev:   Info about deferred packets is stored here.
+ * Return:  The next packet to transmit, or NULL if there are no deferred
+ *          Homa packets.
+ */
+struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev)
+{
+	struct homa_rpc_qdisc *qrpc;
+	struct homa_skb_info *info;
+	struct homa_rpc *rpc;
+	struct rb_node *node;
+	struct sk_buff *skb;
+	bool fifo = false;
+	int bytes_left;
+
+	spin_lock_bh(&qdev->defer_lock);
+	node = rb_first_cached(&qdev->deferred_rpcs);
+	if (!node) {
+		spin_unlock_bh(&qdev->defer_lock);
+		return NULL;
+	}
+	qrpc = container_of(node, struct homa_rpc_qdisc, rb_node);
+	rpc = container_of(qrpc, struct homa_rpc, qrpc);
+	if (qdev->srpt_bytes <= 0 &&
+	    qdev->hnet->homa->qshared->fifo_fraction != 0) {
+		fifo = true;
+		rpc = homa_qdisc_get_oldest(qdev);
+		qrpc = &rpc->qrpc;
+		node = &qrpc->rb_node;
+	}
+	skb = skb_dequeue(&qrpc->packets);
+	if (skb_queue_len(&qrpc->packets) == 0) {
+		rb_erase_cached(node, &qdev->deferred_rpcs);
+		if (rpc == qdev->oldest_rpc)
+			qdev->oldest_rpc = NULL;
+	}
+
+	/* Update qrpc->tx_left and qdev->srpt_bytes. This can increase the
+	 * priority of the RPC in qdev->deferred_rpcs; if this is the FIFO RPC
+	 * then we have to remove it from the tree and reinsert it to make
+	 * sure it's in the right position (if this isn't the FIFO RPC then
+	 * its position won't change because it is already highest priority).
+	 */
+	info = homa_get_skb_info(skb);
+	bytes_left = rpc->msgout.length - (info->offset + info->data_bytes);
+	if (bytes_left < qrpc->tx_left)
+		qrpc->tx_left = bytes_left;
+	if (fifo) {
+		if (skb_queue_len(&qrpc->packets) > 0) {
+			rb_erase_cached(node, &qdev->deferred_rpcs);
+			homa_qdisc_insert_rb(qdev, rpc);
+		}
+		qdev->srpt_bytes += (qdisc_pkt_len(skb) *
+				qdev->hnet->homa->qshared->fifo_weight) >>
+				HOMA_FIFO_WEIGHT_SHIFT;
+		INC_METRIC(pacer_fifo_bytes, qdisc_pkt_len(skb));
+	} else {
+		qdev->srpt_bytes -= qdisc_pkt_len(skb);
+	}
+
+	if (!homa_qdisc_any_deferred(qdev)) {
+		INC_METRIC(nic_backlog_cycles, homa_clock() - qdev->last_defer);
+		qdev->last_defer = 0;
+	}
+	spin_unlock_bh(&qdev->defer_lock);
+	return skb;
+}
+
+/**
+ * homa_qdisc_xmit_deferred_homa() - Transmit the highest-priority deferred
+ * Homa packet and dequeue it from the structures that manage deferred packets.
+ * @qdev:   Info about deferred packets is stored here.
+ * Return:  The number of bytes in the transmitted packet (including headers)
+ *          or 0 if there were no deferred Homa packets.
+ */ +int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev) +{ + struct netdev_queue *txq; + struct homa_data_hdr *h; + struct Qdisc *qdisc; + struct sk_buff *skb; + int pkt_len; + + skb = homa_qdisc_get_deferred_homa(qdev); + if (!skb) + return 0; + + pkt_len = qdisc_pkt_len(skb); + homa_qdisc_update_link_idle(qdev, pkt_len, -1); + h = (struct homa_data_hdr *)skb_transport_header(skb); + tt_record3("homa_qdisc_pacer queuing homa data packet for id %d, offset %d on qid %d", + be64_to_cpu(h->common.sender_id), + homa_get_offset(h), skb_get_queue_mapping(skb)); + + rcu_read_lock_bh(); + txq = netdev_get_tx_queue(skb->dev, skb_get_queue_mapping(skb)); + qdisc = rcu_dereference_bh(txq->qdisc); + if (qdisc->ops == &homa_qdisc_ops) { + homa_qdisc_schedule_skb(skb, qdisc); + homa_qdisc_update_congested(qdisc_priv(qdisc)); + } else { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + } + rcu_read_unlock_bh(); + return pkt_len; +} + +/** + * homa_qdisc_free_homa() - Free all of the Homa packets that have been + * deferred for @qdev. + * @qdev: Object whose @homa_deferred list should be emptied. + */ +void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev) +{ + struct sk_buff *skb; + + while (1) { + skb = homa_qdisc_get_deferred_homa(qdev); + if (!skb) + break; + kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); + } +} + +/** + * homa_qdisc_update_link_idle() - This function is invoked before transmitting + * a packet. If the current NIC queue length is no more than @max_queue_cycles + * then it updates @qdev->link_idle_time to include @bytes; otherwise it does + * nothing. + * @qdev: Information about the device. + * @bytes: Size of a packet that is about to be transmitted; + * includes all headers out through the Ethernet header, + * but not additional overhead such as CRC and gap + * between packets. + * @max_queue_cycles: If it will take longer than this amount of time for + * previously queued bytes to be transmitted, then don't + * update @qdev->link_idle_time. A negative value means + * any length queue is OK. + * Return: Nonzero if @qdev->link_idle_time was updated, false + * if the queue was too long. + */ +int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, + int max_queue_cycles) +{ + u64 idle, new_idle, clock, cycles_for_packet; + + cycles_for_packet = qdev->cycles_per_mibyte; + cycles_for_packet = (cycles_for_packet * + (bytes + HOMA_ETH_FRAME_OVERHEAD)) >> 20; + + /* The following loop may be executed multiple times if there + * are conflicting updates to qdev->link_idle_time. + */ + while (1) { + clock = homa_clock(); + idle = atomic64_read(&qdev->link_idle_time); + if (idle < clock) { + new_idle = clock + cycles_for_packet; + } else { + if (max_queue_cycles >= 0 && (idle - clock) > + max_queue_cycles) + return 0; + new_idle = idle + cycles_for_packet; + } + + if (atomic64_cmpxchg_relaxed(&qdev->link_idle_time, idle, + new_idle) == idle) + break; + INC_METRIC(idle_time_conflicts, 1); + } + return 1; +} + +/** + * homa_qdisc_pacer_main() - Top-level function for a device-specific + * thread that is responsible for transmitting deferred packets on that + * device. + * @device: Pointer to a struct homa_qdisc_dev. + * Return: Always 0. 
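+ * The thread runs until kthread_stop is invoked, which happens when its
+ * homa_qdisc_dev is deleted.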
+ */ +int homa_qdisc_pacer_main(void *device) +{ + struct homa_qdisc_dev *qdev = device; + int status; + u64 start; + + while (1) { + if (kthread_should_stop()) + break; + start = homa_clock(); + homa_qdisc_pacer(qdev, false); + INC_METRIC(pacer_cycles, homa_clock() - start); + + if (homa_qdisc_any_deferred(qdev)) { + /* There are more packets to transmit (the NIC queue + * must be full); call the pacer again, but first + * give other threads a chance to run (otherwise + * low-level packet processing such as softirq could + * starve). + */ + schedule(); + continue; + } + + tt_record("homa_qdisc pacer sleeping"); + status = wait_event_interruptible(qdev->pacer_sleep, + kthread_should_stop() || homa_qdisc_any_deferred(qdev)); + tt_record1("homa_qdisc pacer woke up with status %d", status); + if (status != 0 && status != -ERESTARTSYS) + break; + } + return 0; +} + +/** + * homa_qdisc_pacer() - Transmit a few packets from the homa_deferred and + * tcp_deferred lists while keeping NIC queue short. There may still be + * deferred packets when this function returns. + * + * Note: this function may be invoked from places other than + * homa_qdisc_pacer_main. The reason for this is that (as of 10/2019) + * Linux's thread scheduler is unpredictable and could neglect the thread + * for long periods of time (e.g., because it is assigned to the same + * CPU as a busy interrupt handler). This can result in poor utilization + * of the network link. So, this method gets invoked from other places as + * well, to increase the likelihood that we keep the link busy. Those other + * invocations are not guaranteed to happen, so the pacer thread provides a + * backstop. + * @qdev: The device on which to transmit. + * @dont_spin: If true, then return immediately if the NIC is congested, + * rather than spinning until congestion drops. If this value + * is false, then the caller must not be running at SoftIRQ + * level, and it must not have acquired a lock that disables + * BH processing (otherwise this function can self-deadlock). + */ +void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool dont_spin) +{ + int i, xmit_bytes, max_cycles; + + /* Make sure only one instance of this function executes at a + * time. + */ + if (!spin_trylock(&qdev->pacer_mutex)) + return; + + /* Each iteration through the following loop sends one packet. We + * limit the number of passes through this loop in order to cap the + * time spent in one call to this function (see note in + * homa_qdisc_pacer_main about interfering with softirq handlers). + */ + max_cycles = qdev->hnet->homa->qshared->max_nic_est_backlog_cycles; + for (i = 0; i < 5; i++) { + u64 idle_time, now; + + /* If the NIC is congested, wait for the congestion to + * subside. + */ + now = homa_clock(); + idle_time = atomic64_read(&qdev->link_idle_time); + while (1) { + struct homa_qdisc *congested; + + congested = READ_ONCE(qdev->congested_qdisc); + if (congested && + homa_qdisc_bytes_pending(congested) + <= qdev->max_nic_queue_bytes) { + WRITE_ONCE(qdev->congested_qdisc, NULL); + congested = NULL; + } + if (!congested && (now + max_cycles) >= idle_time) + break; + + /* If we've xmitted at least one packet then + * return (this helps with testing and also + * allows homa_qdisc_pacer_main to yield the core). 
+			 */
+			if (i != 0 || dont_spin)
+				goto done;
+			cpu_relax();
+			now = homa_clock();
+			UNIT_HOOK("pacer spin");
+		}
+
+		/* Note: when we get here, it's possible that the NIC queue is
+		 * still too long because other threads have queued packets,
+		 * but we transmit anyway. If we don't, we could end up in a
+		 * situation where the pacer thread is effectively starved by
+		 * other "helper" threads.
+		 */
+		UNIT_HOOK("pacer_xmit");
+
+		/* Decide whether to transmit a Homa or TCP packet. If
+		 * only one protocol has packets, reset homa_credit to
+		 * prevent negative credit buildup for the protocol
+		 * with packets.
+		 */
+		if (list_empty(&qdev->deferred_qdiscs)) {
+			if (!rb_first_cached(&qdev->deferred_rpcs))
+				break;
+			qdev->homa_credit = 1;
+		} else if (!rb_first_cached(&qdev->deferred_rpcs)) {
+			qdev->homa_credit = 0;
+		}
+		if (qdev->homa_credit > 0) {
+			xmit_bytes = homa_qdisc_xmit_deferred_homa(qdev);
+			if (xmit_bytes > 0) {
+				INC_METRIC(pacer_homa_packets, 1);
+				INC_METRIC(pacer_homa_bytes, xmit_bytes);
+				qdev->homa_credit -= xmit_bytes * (100 -
+					qdev->hnet->homa->qshared->homa_share);
+			}
+		} else {
+			xmit_bytes = homa_qdisc_xmit_deferred_tcp(qdev);
+			if (xmit_bytes > 0) {
+				INC_METRIC(pacer_tcp_packets, 1);
+				INC_METRIC(pacer_tcp_bytes, xmit_bytes);
+				qdev->homa_credit += xmit_bytes *
+					qdev->hnet->homa->qshared->homa_share;
+			}
+		}
+		if (dont_spin)
+			INC_METRIC(pacer_help_bytes, xmit_bytes);
+		INC_METRIC(pacer_xmit_cycles, homa_clock() - now);
+	}
+done:
+	spin_unlock(&qdev->pacer_mutex);
+}
+
+/**
+ * homa_qdisc_pacer_check() - Check whether any of the homa_qdisc pacer
+ * threads associated with @homa have fallen behind (e.g. because they
+ * got descheduled by Linux). If so, call the pacer directly to transmit
+ * deferred packets.
+ * @homa:   Overall information about the Homa transport; used to find
+ *          homa_qdisc_devs to check.
+ */
+void homa_qdisc_pacer_check(struct homa *homa)
+{
+	struct homa_qdisc_dev *qdev;
+	u64 now = homa_clock();
+	int max_cycles;
+
+	max_cycles = homa->qshared->max_nic_est_backlog_cycles;
+	rcu_read_lock();
+	list_for_each_entry_rcu(qdev, &homa->qshared->qdevs, links) {
+		if (!homa_qdisc_any_deferred(qdev))
+			continue;
+
+		/* The ">> 1" means that we only help out if the NIC queue has
+		 * dropped below half of its maximum allowed capacity. This
+		 * gives the pacer thread the first shot at queuing new
+		 * packets.
+		 */
+		if (now + (max_cycles >> 1) <
+		    atomic64_read(&qdev->link_idle_time))
+			continue;
+		tt_record("homa_qdisc_pacer_check calling homa_qdisc_pacer");
+		homa_qdisc_pacer(qdev, true);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * homa_qdisc_dointvec() - This function is a wrapper around proc_dointvec. It
+ * is invoked to read and write pacer-related sysctl values.
+ * @table:    sysctl table describing value to be read or written.
+ * @write:    Nonzero means value is being written, 0 means read.
+ * @buffer:   Address of the input/output data.
+ * @lenp:     Points to the number of bytes of data at @buffer; updated to
+ *            reflect the number of bytes actually processed.
+ * @ppos:     Current offset within the virtual file; updated to reflect
+ *            the bytes processed.
+ *
+ * Return: 0 for success, nonzero for error.
+ */
+int homa_qdisc_dointvec(const struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table table_copy;
+	struct homa_qdisc_shared *qshared;
+	int result;
+
+	qshared = homa_net(current->nsproxy->net_ns)->homa->qshared;
+
+	/* Generate a new ctl_table that refers to a field in the
+	 * net-specific struct homa_qdisc_shared.
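+	 * (This assumes the static sysctl table declares each .data field
+	 * as an offset within struct homa_qdisc_shared rather than as an
+	 * absolute address; the addition below turns that offset into a
+	 * real pointer for this namespace's instance. The table itself is
+	 * defined elsewhere and is not shown in this patch.)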
+ */ + table_copy = *table; + table_copy.data = ((char *)qshared) + (uintptr_t)table_copy.data; + + result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); + if (write) + homa_qdisc_update_sysctl_deps(qshared); + return result; +} + +/** + * homa_qdev_update_sysctl() - Recompute information in a homa_qdisc_dev + * that depends on sysctl parameters. + * @qdev: Update information here that depends on sysctl values. + */ +void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev) +{ + struct ethtool_link_ksettings ksettings; + struct homa *homa = qdev->hnet->homa; + const struct ethtool_ops *ops; + u64 tmp, tmp2; + + qdev->link_mbps = homa->link_mbps; + ops = qdev->dev->ethtool_ops; + if (ops && ops->get_link_ksettings) { + if (ops->get_link_ksettings(qdev->dev, &ksettings) == 0) + qdev->link_mbps = ksettings.base.speed; + } + + /* Must reset srpt_bytes: if qshared->fifo_fraction was previously + * zero, srpt_bytes could be an enormous negative number. Without + * a reset, the pacer could transmit exclusively FIFO for a long time. + */ + qdev->srpt_bytes = 0; + + /* Compute cycles_per_mibyte based on the link speed (mibytes/sec) + * and max_link_usage: + * + * cycles/sec + * cycles/mibyte = (100/max_link_usage) * ------------- + * mibytes/sec + * + * 100 * homa_clock_khz() * 1000 + * = -------------------------------------------------- + * max_link_usage * link_mbps * (1000000 / 1<<20) / 8 + * + * 8 * homa_clock_khz() 1<<20 + * = ----------------------------- * ------- + * max_link_usage * link_mbps 10 + */ + tmp = 8ULL * homa_clock_khz(); + tmp <<= 20; + tmp2 = 10ULL * homa->qshared->max_link_usage * qdev->link_mbps; + do_div(tmp, tmp2); + qdev->cycles_per_mibyte = tmp; + + qdev->max_nic_queue_bytes = (homa->qshared->max_nic_queue_usecs * + qdev->link_mbps) >> 3; +} + +/** + * homa_qdisc_update_sysctl_deps() - Update any qdisc fields that depend + * on values set by sysctl. This function is invoked anytime a qdisc sysctl + * value is updated. + * @qshared: Qdisc data to update. + */ +void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared) +{ + struct homa_qdisc_dev *qdev; + u64 tmp; + + if (qshared->fifo_fraction > 0) { + tmp = (1000 - qshared->fifo_fraction) << HOMA_FIFO_WEIGHT_SHIFT; + do_div(tmp, qshared->fifo_fraction); + qshared->fifo_weight = tmp; + } + + qshared->max_nic_est_backlog_cycles = homa_ns_to_cycles(1000 * + qshared->max_nic_est_backlog_usecs); + + if (qshared->homa_share < 0) + qshared->homa_share = 0; + if (qshared->homa_share > 100) + qshared->homa_share = 100; + if (qshared->max_link_usage < 5) + qshared->max_link_usage = 5; + if (qshared->max_link_usage > 100) + qshared->max_link_usage = 100; + + /* Use a mutex rather than RCU to prevent qdev deletion while we + * traverse the list. This is more expensive, but RCU isn't safe + * because homa_qdev_update_sysctl may block (and efficiency isn't + * paramount here). 
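+	 *
+	 * (Aside: to illustrate the cycles_per_mibyte formula in
+	 * homa_qdev_update_sysctl above with assumed values, a 1 GHz
+	 * homa_clock (homa_clock_khz() == 1000000), link_mbps == 25000,
+	 * and max_link_usage == 90 yield
+	 * (8 * 1000000 << 20) / (10 * 90 * 25000) ~= 372827 cycles per
+	 * MiB, i.e. about 373 us, matching 1 MiB at 90% of 25 Gbps.)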
+	 */
+	mutex_lock(&qshared->mutex);
+	list_for_each_entry_rcu(qdev, &qshared->qdevs, links)
+		homa_qdev_update_sysctl(qdev);
+	mutex_unlock(&qshared->mutex);
+}
diff --git a/homa_qdisc.h b/homa_qdisc.h
new file mode 100644
index 00000000..ab3c9ad9
--- /dev/null
+++ b/homa_qdisc.h
@@ -0,0 +1,457 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file contains definitions related to Homa's special-purpose
+ * queuing discipline.
+ */
+
+#include "homa_rpc.h"
+
+#ifdef __UNIT_TEST__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif /* __UNIT_TEST__ */
+#include
+#ifdef __UNIT_TEST__
+#pragma GCC diagnostic pop
+#endif /* __UNIT_TEST__ */
+
+#include
+#include
+
+#ifndef _HOMA_QDISC_H
+#define _HOMA_QDISC_H
+
+/**
+ * struct homa_qdisc - Contains Homa-specific data for a single instance of
+ * the homa queuing discipline.
+ */
+struct homa_qdisc {
+	/** @qdisc: The Qdisc that this structure is associated with. */
+	struct Qdisc *qdisc;
+
+	/** @qdev: Info shared among all qdiscs for a net_device. */
+	struct homa_qdisc_dev *qdev;
+
+	/**
+	 * @ix: Index of this qdisc's transmit queue among all those for
+	 * its net_device.
+	 */
+	int ix;
+
+	/**
+	 * @deferred_tcp: List of non-Homa packets for this qdisc that have
+	 * been deferred because of NIC overload, in order of arrival.
+	 * Synchronized with qdev->defer_lock.
+	 */
+	struct sk_buff_head deferred_tcp;
+
+	/**
+	 * @defer_links: Used to link this object into qdev->deferred_qdiscs
+	 * when deferred_tcp is nonempty. This will be an empty list if
+	 * deferred_tcp is empty. Synchronized with qdev->defer_lock.
+	 */
+	struct list_head defer_links;
+};
+
+/**
+ * struct homa_qdisc_dev - Contains information shared across all of the
+ * homa_qdiscs associated with a net_device.
+ */
+struct homa_qdisc_dev {
+	/** @dev: Device common to all qdiscs using this struct. */
+	struct net_device *dev;
+
+	/**
+	 * @hnet: Homa's information about the network namespace
+	 * this object belongs to.
+	 */
+	struct homa_net *hnet;
+
+	/**
+	 * @refs: Reference count (e.g. includes one reference for each
+	 * homa_qdisc that references this object). Deletion when the count
+	 * reaches zero is synchronized via qshared->mutex.
+	 */
+	refcount_t refs;
+
+	/** @link_mbps: Speed of the link associated with @dev, in Mbps. */
+	int link_mbps;
+
+	/**
+	 * @cycles_per_mibyte: The number of homa_clock cycles that it takes
+	 * to transmit 2**20 bytes on the link associated with @dev; computed
+	 * from @link_mbps. This is actually a slight overestimate (if we
+	 * underestimate, the link queue could grow without bound during
+	 * periods of high traffic).
+	 */
+	int cycles_per_mibyte;
+
+	/**
+	 * @links: Used to link this object into the qdevs list in a
+	 * homa_qdisc_shared struct.
+	 */
+	struct list_head links;
+
+	/**
+	 * @link_idle_time: The time, measured by homa_clock, at which we
+	 * estimate that all of the packets passed to @dev will have been
+	 * transmitted, assuming the NIC can transmit at full link speed.
+	 * May be in the past. See the PACING comment at the top of
+	 * homa_qdisc.c for a discussion of the pacing mechanism.
+	 */
+	atomic64_t link_idle_time __aligned(L1_CACHE_BYTES);
+
+	/**
+	 * @deferred_rpcs: Contains all homa_rpc's with deferred packets, in
+	 * SRPT order.
+	 */
+	struct rb_root_cached deferred_rpcs;
+
+	/**
+	 * @oldest_rpc: The RPC in deferred_rpcs with the oldest init_time, or
+	 * NULL if not currently known.
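+	 * Recomputed lazily by homa_qdisc_get_oldest and cleared when the
+	 * oldest RPC is removed from deferred_rpcs.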
+	 */
+	struct homa_rpc *oldest_rpc;
+
+	/**
+	 * @srpt_bytes: The number of bytes that should be transmitted from
+	 * SRPT packets before transmitting a FIFO packet. <= 0 means
+	 * the next packet transmission should be FIFO.
+	 */
+	s64 srpt_bytes;
+
+	/**
+	 * @deferred_qdiscs: List of all homa_qdiscs with non-Homa packets
+	 * that have been deferred because of NIC overload.
+	 */
+	struct list_head deferred_qdiscs;
+
+	/**
+	 * @next_qdisc: Points to either the defer_links field in a homa_qdisc
+	 * or to deferred_qdiscs above. Used to select the next non-Homa packet
+	 * for transmission. Note: this may refer to deferred_qdiscs even when
+	 * deferred_qdiscs is nonempty.
+	 */
+	struct list_head *next_qdisc;
+
+	/**
+	 * @last_defer: The most recent homa_clock() time when a packet was
+	 * deferred, or 0 if there are currently no deferred packets.
+	 */
+	u64 last_defer;
+
+	/**
+	 * @max_nic_queue_bytes: The number of bytes corresponding to
+	 * qshared->max_nic_queue_usecs.
+	 */
+	int max_nic_queue_bytes;
+
+	/**
+	 * @congested_qdisc: If non-NULL, this variable identifies a qdisc
+	 * whose NIC queue is overloaded according to @max_nic_queue_bytes.
+	 * NULL means no queue is currently known to be congested. This
+	 * variable is accessed without synchronization. See the PACING
+	 * comment at the top of homa_qdisc.c for a discussion of the packet
+	 * pacing architecture.
+	 */
+	struct homa_qdisc *congested_qdisc;
+
+	/**
+	 * @defer_lock: Synchronizes access to information about deferred
+	 * packets, including deferred_rpcs, deferred_qdiscs, next_qdisc,
+	 * last_defer, and some information in homa_qdiscs.
+	 */
+	spinlock_t defer_lock;
+
+	/**
+	 * @homa_credit: When there are both Homa and TCP deferred packets,
+	 * this is used to balance output between them according to the
+	 * homa_share sysctl value. Positive means that Homa packets should
+	 * be transmitted next, zero or negative means TCP. When a TCP
+	 * packet is transmitted, this is incremented by the packet length
+	 * times homa_share; when a Homa packet is transmitted, it is
+	 * decremented by packet length times (100 - homa_share). Used only
+	 * by the pacer, so no need for synchronization.
+	 */
+	int homa_credit;
+
+	/**
+	 * @pacer_kthread: Kernel thread that eventually transmits deferred
+	 * packets (those in deferred_rpcs and deferred_qdiscs).
+	 */
+	struct task_struct *pacer_kthread;
+
+	/**
+	 * @pacer_sleep: Used to block the pacer thread when there
+	 * are no deferred packets.
+	 */
+	struct wait_queue_head pacer_sleep;
+
+	/**
+	 * @pacer_mutex: Ensures that only one instance of
+	 * homa_qdisc_pacer runs at a time. Only used in "try" mode:
+	 * never block on this. Note: must not disable bh when acquiring
+	 * this lock, because the pacer may wait for bh activity to
+	 * complete.
+	 */
+	spinlock_t pacer_mutex ____cacheline_aligned_in_smp;
+
+	/**
+	 * @rcu_head: Holds state of a pending call_rcu invocation when
+	 * this struct is deleted.
+	 */
+	struct rcu_head rcu_head;
+};
+
+/**
+ * struct homa_qdisc_shared - There is one of these structs for each
+ * struct homa. Contains information that is shared across all homa_qdiscs
+ * and homa_qdisc_devs for the struct homa.
+ */
+struct homa_qdisc_shared {
+	/**
+	 * @mutex: Must hold when modifying qdevs. Can scan qdevs
+	 * without locking using RCU.
+	 */
+	struct mutex mutex;
+
+	/**
+	 * @qdevs: RCU list of all homa_qdisc_devs that currently
+	 * exist for this struct homa.
+	 */
+	struct list_head qdevs;
+
+	/**
+	 * @fifo_fraction: Out of every 1000 bytes transmitted by the
+	 * pacer, about this many will be transmitted from the oldest
+	 * message rather than the highest-priority message. Set externally
+	 * via sysctl.
+	 */
+	int fifo_fraction;
+
+	/**
+	 * @fifo_weight: Determines how much qdev->srpt_bytes is updated
+	 * when a FIFO packet is transmitted (for each FIFO byte transmitted,
+	 * @fifo_weight >> HOMA_FIFO_WEIGHT_SHIFT SRPT bytes should be
+	 * transmitted); computed from @fifo_fraction. Valid only if
+	 * fifo_fraction is nonzero.
+	 */
+	int fifo_weight;
+#define HOMA_FIFO_WEIGHT_SHIFT 10
+
+	/**
+	 * @max_nic_est_backlog_usecs: Limits the NIC queue length: we won't
+	 * queue packets in the NIC for transmission if link_idle_time is
+	 * this many microseconds in the future (or more). Set externally
+	 * via sysctl.
+	 */
+	int max_nic_est_backlog_usecs;
+
+	/**
+	 * @max_nic_est_backlog_cycles: Same as max_nic_est_backlog_usecs
+	 * except in homa_clock() units.
+	 */
+	int max_nic_est_backlog_cycles;
+
+	/**
+	 * @max_nic_queue_usecs: An additional limit on NIC queue buildup:
+	 * if any individual NIC queue reaches a length where it would
+	 * take at least this many microseconds to transmit all of its packets,
+	 * then no more packets will be queued for *any* NIC queue until
+	 * the queue gets below this limit. Set externally via sysctl.
+	 */
+	int max_nic_queue_usecs;
+
+	/**
+	 * @defer_min_bytes: If a packet has fewer bytes than this, then it
+	 * will be transmitted immediately, regardless of NIC queue length.
+	 * We have this limit because for very small packets CPU overheads
+	 * make it impossible to keep up with the NIC, so (a) the NIC queue
+	 * can't grow and (b) using the pacer would serialize all of these
+	 * packets through a single core, which makes things even worse.
+	 * Set externally via sysctl.
+	 */
+	int defer_min_bytes;
+
+	/**
+	 * @homa_share: When the uplink is overloaded, this determines how
+	 * to share bandwidth between TCP and Homa. It gives the percentage
+	 * of bandwidth that Homa will receive; TCP (and all other protocols,
+	 * such as UDP) get the remainder. Must be between 0 and 100,
+	 * inclusive.
+	 */
+	int homa_share;
+
+	/**
+	 * @max_link_usage: An integer <= 100 indicating the maximum percentage
+	 * of uplink bandwidth that Homa will attempt to utilize. A smaller
+	 * value reduces the likelihood of queue buildup in the NIC, but
+	 * also prevents full link utilization.
+	 */
+	int max_link_usage;
+
+#ifndef __STRIP__ /* See strip.py */
+	/**
+	 * @sysctl_header: Used to remove sysctl values when this structure
+	 * is destroyed.
+	 */
+	struct ctl_table_header *sysctl_header;
+#endif /* See strip.py */
+};
+
+/**
+ * struct homa_rcu_kfreer - Used by homa_rcu_kfree to defer kfree-ing
+ * an object until it is RCU-safe.
+ */
+struct homa_rcu_kfreer {
+	/** @rcu_head: Holds state of a pending call_rcu invocation. */
+	struct rcu_head rcu_head;
+
+	/** @object: Kfree this after waiting until RCU has synced. */
+	void *object;
+};
+
+void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev);
+bool homa_qdisc_can_bypass(struct sk_buff *skb,
+			   struct homa_qdisc *q);
+void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev,
+			   struct sk_buff *skb);
+void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb);
+void homa_qdisc_destroy(struct Qdisc *sch);
+void homa_qdisc_dev_callback(struct rcu_head *head);
+int homa_qdisc_dointvec(const struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos);
+int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free);
+void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev);
+struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev);
+struct homa_rpc *
+	homa_qdisc_get_oldest(struct homa_qdisc_dev *qdev);
+int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack);
+void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev,
+			  struct homa_rpc *rpc);
+void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool dont_spin);
+void homa_qdisc_pacer_check(struct homa *homa);
+int homa_qdisc_pacer_main(void *device);
+struct homa_qdisc_dev *
+	homa_qdisc_qdev_get(struct net_device *dev);
+void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev);
+int homa_qdisc_register(void);
+struct homa_qdisc_shared *
+	homa_qdisc_shared_alloc(void);
+void homa_qdisc_shared_free(struct homa_qdisc_shared *qshared);
+void homa_qdisc_unregister(void);
+int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev,
+				int bytes, int max_queue_cycles);
+void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared);
+int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev);
+int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev);
+void homa_rcu_kfree(void *object);
+void homa_rcu_kfree_callback(struct rcu_head *head);
+
+/**
+ * homa_qdisc_active() - Return true if homa qdiscs are enabled for @homa
+ * (so the old pacer should not be used), false otherwise.
+ * @homa:   Information about the Homa transport.
+ * Return:  See above.
+ */
+static inline bool homa_qdisc_active(struct homa *homa)
+{
+	return list_first_or_null_rcu(&homa->qshared->qdevs,
+				      struct homa_qdisc_dev, links) != NULL;
+}
+
+/**
+ * homa_qdisc_rpc_init() - Initialize a homa_rpc_qdisc struct.
+ * @qrpc:   Struct to initialize.
+ */
+static inline void homa_qdisc_rpc_init(struct homa_rpc_qdisc *qrpc)
+{
+	skb_queue_head_init(&qrpc->packets);
+	qrpc->tx_left = HOMA_MAX_MESSAGE_LENGTH;
+}
+
+/**
+ * homa_qdisc_any_deferred() - Returns true if there are currently any
+ * deferred packets in a homa_qdisc_dev, false if there are none.
+ * @qdev:   Holds info about deferred packets.
+ * Return:  See above.
+ */
+static inline bool homa_qdisc_any_deferred(struct homa_qdisc_dev *qdev)
+{
+	return rb_first_cached(&qdev->deferred_rpcs) ||
+	       !list_empty(&qdev->deferred_qdiscs);
+}
+
+/**
+ * homa_qdisc_schedule_skb() - Enqueue an skb on a qdisc and schedule the
+ * qdisc for execution.
+ * @skb:    Packet buffer to queue for output.
+ * @qdisc:  Qdisc on which to schedule it.
+ */
+static inline void homa_qdisc_schedule_skb(struct sk_buff *skb,
+					   struct Qdisc *qdisc)
+{
+	spin_lock_bh(qdisc_lock(qdisc));
+	qdisc_enqueue_tail(skb, qdisc);
+	spin_unlock_bh(qdisc_lock(qdisc));
+	__netif_schedule(qdisc);
+}
+
+/**
+ * homa_qdisc_precedes() - Return true if @rpc1 is considered "less" than
+ * @rpc2 (i.e. higher priority) for the purposes of qdev->deferred_rpcs, or
+ * false if @rpc1 is considered "greater" (ties not allowed).
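+ * For example, an RPC with 1000 bytes left to transmit precedes one with
+ * 5000 bytes left, regardless of their relative ages.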
+ * @rpc1:   RPC to compare.
+ * @rpc2:   RPC to compare; must be different from @rpc1.
+ * Return:  See above.
+ */
+static inline bool homa_qdisc_precedes(struct homa_rpc *rpc1,
+				       struct homa_rpc *rpc2)
+{
+	/* The primary metric for comparison is bytes left to transmit;
+	 * in case of ties, use RPC age as secondary metric (oldest RPC
+	 * is "less"), and if still tied (highly unlikely) use the
+	 * addresses of the RPCs as a tie-breaker.
+	 */
+	if (rpc1->qrpc.tx_left < rpc2->qrpc.tx_left)
+		return true;
+	else if (rpc2->qrpc.tx_left < rpc1->qrpc.tx_left)
+		return false;
+	if (rpc1->msgout.init_time < rpc2->msgout.init_time)
+		return true;
+	else if (rpc2->msgout.init_time < rpc1->msgout.init_time)
+		return false;
+	return rpc1 < rpc2;
+}
+
+/**
+ * homa_qdisc_bytes_pending() - Return the total number of bytes in skbs
+ * that have been enqueued in the NIC for transmission via a given queue
+ * but have not yet been returned after transmission.
+ * @q:      Return the pending bytes for the netdev_queue associated with
+ *          this qdisc.
+ * Return:  See above.
+ */
+static inline int homa_qdisc_bytes_pending(struct homa_qdisc *q)
+{
+	/* Ideally this function would be provided by dynamic_queue_limits.h
+	 * so that we don't have to root around in its data structures.
+	 */
+	struct dql *dql = &qdisc_from_priv(q)->dev_queue->dql;
+
+	return READ_ONCE(dql->num_queued) - READ_ONCE(dql->num_completed);
+}
+
+/**
+ * homa_qdisc_update_congested() - If the NIC queue for a qdisc has
+ * become too long, record the fact that this qdisc is congested.
+ * @q:      qdisc whose netdev_queue should be checked.
+ */
+static inline void homa_qdisc_update_congested(struct homa_qdisc *q)
+{
+	if (homa_qdisc_bytes_pending(q) > q->qdev->max_nic_queue_bytes)
+		WRITE_ONCE(q->qdev->congested_qdisc, q);
+}
+
+#endif /* _HOMA_QDISC_H */
diff --git a/homa_receiver.cc b/homa_receiver.cc
index b259e241..5839cb75 100644
--- a/homa_receiver.cc
+++ b/homa_receiver.cc
@@ -1,16 +1,5 @@
-/* Copyright (c) 2022 Stanford University
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/* Copyright (c) 2022 Homa Developers
+ * SPDX-License-Identifier: BSD-1-Clause
  */
 
 #include
@@ -21,7 +10,7 @@
  * homa::receiver::homa() - Constructor for receivers.
  * @fd:         Homa socket from which this object will receive incoming
  *              messages. The caller is responsible for setting up buffering
- *              on the socket using setsockopt with the SO_HOMA_SET_BUF option.
+ *              on the socket using setsockopt with the SO_HOMA_RCVBUF option.
  *              The file descriptor must be valid for the lifetime of this
  *              object.
  * @buf_region: Location of the buffer region that was allocated for
@@ -63,13 +52,14 @@ homa::receiver::~receiver()
  */
 void homa::receiver::copy_out(void *dest, size_t offset, size_t count) const
 {
-	ssize_t limit = offset + count;
 	char *cdest = static_cast<char *>(dest);
+	ssize_t limit = offset + count;
 
 	if (limit > msg_length)
 		limit = msg_length;
 	while (static_cast<ssize_t>(offset) < limit) {
 		size_t chunk_size = contiguous(offset);
+
 		memcpy(cdest, get<char>(offset), chunk_size);
 		offset += chunk_size;
 		cdest += chunk_size;
@@ -79,12 +69,10 @@ void homa::receiver::copy_out(void *dest, size_t offset, size_t count) const
 /**
  * homa::receiver::receive() - Release resources for the current message, if
  * any, and receive a new incoming message.
- * @flags:      Various OR'ed bits such as HOMA_RECVMSG_REQUEST and
- *              HOMA_RECVMSG_NONBLOCKING. See the Homa documentation
- *              for the flags field of recvmsg for details.
- * @id:         Identifier of a particular RPC whose result is desired,
- *              or 0. See the Homa documentation for the id field of
- *              recvmsg for details.
+ * @flags:      Flag bits for the recvmsg invocation (e.g., MSG_DONTWAIT).
+ * @id:         Identifier of a private RPC whose result is desired, or 0
+ *              to wait for a shared RPC. See the Homa documentation for the id
+ *              field of recvmsg for details.
  * Return:      The length of the new active message. If an error occurs, -1
  *              is returned and additional information is available in
  *              errno. Note: if id() returns a nonzero result after an
@@ -93,10 +81,10 @@ void homa::receiver::copy_out(void *dest, size_t offset, size_t count) const
  */
 size_t homa::receiver::receive(int flags, uint64_t id)
 {
-	control.flags = flags;
 	control.id = id;
 	hdr.msg_namelen = sizeof(source);
-	msg_length = recvmsg(fd, &hdr, 0);
+	hdr.msg_controllen = sizeof(control);
+	msg_length = recvmsg(fd, &hdr, flags);
 	if (msg_length < 0) {
 		control.num_bpages = 0;
 		id = 0;
@@ -115,9 +103,8 @@ void homa::receiver::release()
 		return;
 
 	/* This recvmsg request will do nothing except return buffer space. */
-	control.flags = HOMA_RECVMSG_NONBLOCKING;
 	control.id = 0;
-	recvmsg(fd, &hdr, 0);
+	recvmsg(fd, &hdr, MSG_DONTWAIT);
 	control.num_bpages = 0;
 	msg_length = -1;
 }
\ No newline at end of file
diff --git a/homa_receiver.h b/homa_receiver.h
index 6167ad06..ca367831 100644
--- a/homa_receiver.h
+++ b/homa_receiver.h
@@ -1,34 +1,21 @@
-/* Copyright (c) 2022 Stanford University
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
 
 #pragma once
 
 #include
 #include
 
+extern "C" {
 #include "homa.h"
+}
 
 namespace homa {
-
-/**
- * class homa::receiver - Helper class for receiving a series of messages
- * from a Homa socket. This class serves two purposes: first, it implements
- * the application side of the Homa buffer management protocol, returning
- * receive buffer space to Homa when the application longer needs it. Second,
- * it provides convenience methods for accessing messages that are scattered\
- * over several discontiguous regions of buffer space.
+/* Helper class for receiving a series of messages from a Homa socket. This
+ * class serves two purposes: first, it implements the application side of
+ * the Homa buffer management protocol, returning receive buffer space to
+ * Homa when the application no longer needs it. Second, it provides
+ * convenience methods for accessing messages that are scattered over several
+ * discontiguous regions of buffer space.
  *
  * Typical usage:
  * - Call receive, which will invoke Homa to receive an incoming message.
@@ -59,9 +46,9 @@ class receiver {
 	{
 		if (static_cast<ssize_t>(offset) >= msg_length)
 			return 0;
-		if ((offset >> HOMA_BPAGE_SHIFT) == (control.num_bpages-1))
+		if ((offset >> HOMA_BPAGE_SHIFT) == (control.num_bpages - 1))
 			return msg_length - offset;
-		return HOMA_BPAGE_SIZE - (offset & (HOMA_BPAGE_SIZE-1));
+		return HOMA_BPAGE_SIZE - (offset & (HOMA_BPAGE_SIZE - 1));
 	}
 
 	/**
@@ -69,7 +56,7 @@ class receiver {
 	 * cookie associated with the current message; result is undefined
 	 * if there is no current message.
 	 */
-	uint64_t completion_cookie() const
+	uint64_t completion_cookie(void) const
 	{
 		return control.completion_cookie;
 	}
@@ -94,6 +81,7 @@ class receiver {
 	template <typename T> inline T* get(size_t offset, T* storage = nullptr) const
 	{
 		int buf_num = offset >> HOMA_BPAGE_SHIFT;
+
 		if (static_cast<ssize_t>(offset + sizeof(T)) > msg_length)
 			return nullptr;
 		if (contiguous(offset) >= sizeof(T))
@@ -109,7 +97,7 @@ class receiver {
 	 * id() - Return the Homa RPC identifier for the current message,
 	 * or 0 if there is no current message.
 	 */
-	inline uint64_t id() const
+	inline uint64_t id(void) const
 	{
 		return control.id;
 	}
@@ -119,7 +107,7 @@ class receiver {
 	 * is a request, and false if it is a response or if there is no
 	 * current message.
	 */
-	bool is_request() const
+	bool is_request(void) const
 	{
 		return control.id & 1;
 	}
@@ -129,22 +117,22 @@ class receiver {
 	 * current message, or a negative value if there is no current
 	 * message.
	 */
-	ssize_t length() const
+	ssize_t length(void) const
 	{
 		return msg_length;
 	}
 
 	size_t receive(int flags, uint64_t id);
-	void release();
+	void release(void);
 
 	/**
	 * homa::receiver::src_addr() - Return a pointer to the address
	 * of the sender of the current message. The result is undefined
	 * if there is no current message.
	 */
-	const sockaddr_in_union *src_addr() const
+	const struct sockaddr *src_addr(void) const
 	{
-		return &source;
+		return &source.sa;
 	}
 
 protected:
@@ -163,7 +151,11 @@ class receiver {
 	struct homa_recvmsg_args control;
 
 	/** @source: Address of the node that sent the current message. */
-	sockaddr_in_union source;
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in in4;
+		struct sockaddr_in6 in6;
+	} source;
 
 	/** @length: Length of the current message, or < 0 if none. */
 	ssize_t msg_length;
@@ -171,4 +163,5 @@ class receiver {
 	/** @buf_region: First byte of buffer space for this message. */
 	char *buf_region;
 };
-} // namespace homa
\ No newline at end of file
+
+} // namespace homa
diff --git a/homa_rpc.c b/homa_rpc.c
new file mode 100644
index 00000000..1b6a36e2
--- /dev/null
+++ b/homa_rpc.c
@@ -0,0 +1,831 @@
+// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+/* This file contains functions for managing homa_rpc structs.
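+ *
+ * As a rough sketch of how the functions below fit together for a
+ * client RPC (other files handle the actual packet I/O):
+ *
+ *	rpc = homa_rpc_alloc_client(hsk, &dest);  // returned locked
+ *	...                       // send request, unlock, await response
+ *	homa_rpc_end(rpc);        // called with RPC locked; makes the RPC
+ *	                          // inaccessible but frees (almost) nothing
+ *	homa_rpc_reap(hsk, false);  // invoked later to release resources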
+ */
+
+#include "homa_impl.h"
+#include "homa_interest.h"
+#include "homa_peer.h"
+#include "homa_pool.h"
+
+#ifndef __STRIP__ /* See strip.py */
+#include "homa_grant.h"
+#include "homa_pacer.h"
+#include "homa_qdisc.h"
+#include "homa_skb.h"
+#else /* See strip.py */
+#include "homa_stub.h"
+#endif /* See strip.py */
+
+/**
+ * homa_rpc_alloc_client() - Allocate and initialize a client RPC (one that
+ * is used to issue an outgoing request). Doesn't send any packets. Invoked
+ * with no locks held.
+ * @hsk:      Socket to which the RPC belongs.
+ * @dest:     Address of host (IP and port) to which the RPC will be sent.
+ *
+ * Return:    A pointer to the newly allocated object, or a negative
+ *            errno if an error occurred. The RPC will be locked; the
+ *            caller must eventually unlock it. Sets hsk->error_msg on errors.
+ */
+struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
+				       const union sockaddr_in_union *dest)
+	__cond_acquires(crpc->bucket->lock)
+{
+	struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest);
+	struct homa_rpc_bucket *bucket;
+	struct homa_rpc *crpc;
+	int err;
+
+	crpc = kzalloc(sizeof(*crpc), GFP_KERNEL);
+	if (unlikely(!crpc)) {
+		hsk->error_msg = "couldn't allocate memory for client RPC";
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Initialize fields that don't require the socket lock. */
+	crpc->hsk = hsk;
+	crpc->id = atomic64_fetch_add(2, &hsk->homa->next_outgoing_id);
+	bucket = homa_client_rpc_bucket(hsk, crpc->id);
+	crpc->bucket = bucket;
+	crpc->state = RPC_OUTGOING;
+	refcount_set(&crpc->refs, 1);
+	crpc->peer = homa_peer_get(hsk, &dest_addr_as_ipv6);
+	if (IS_ERR(crpc->peer)) {
+		err = PTR_ERR(crpc->peer);
+		crpc->peer = NULL;
+		goto error;
+	}
+	crpc->dport = ntohs(dest->in6.sin6_port);
+	crpc->msgin.length = -1;
+	crpc->msgout.length = -1;
+	IF_NO_STRIP(homa_qdisc_rpc_init(&crpc->qrpc));
+	INIT_LIST_HEAD(&crpc->ready_links);
+	INIT_LIST_HEAD(&crpc->buf_links);
+	INIT_LIST_HEAD(&crpc->dead_links);
+#ifndef __STRIP__ /* See strip.py */
+	INIT_LIST_HEAD(&crpc->grantable_links);
+#endif /* See strip.py */
+	INIT_LIST_HEAD(&crpc->throttled_links);
+	crpc->resend_timer_ticks = hsk->homa->timer_ticks;
+	crpc->magic = HOMA_RPC_MAGIC;
+	crpc->start_time = homa_clock();
+
+	/* Initialize fields that require locking. This allows the most
+	 * expensive work, such as copying in the message from user space,
+	 * to be performed without holding locks. Also, can't hold spin
+	 * locks while doing things that could block, such as memory allocation.
+	 */
+	homa_bucket_lock(bucket, crpc->id);
+	homa_sock_lock(hsk);
+	if (hsk->shutdown) {
+		homa_sock_unlock(hsk);
+		homa_rpc_unlock(crpc);
+		hsk->error_msg = "socket has been shut down";
+		err = -ESHUTDOWN;
+		goto error;
+	}
+	hlist_add_head(&crpc->hash_links, &bucket->rpcs);
+	rcu_read_lock();
+	list_add_tail_rcu(&crpc->active_links, &hsk->active_rpcs);
+	rcu_read_unlock();
+	homa_sock_unlock(hsk);
+
+	return crpc;
+
+error:
+	if (crpc->peer)
+		homa_peer_release(crpc->peer);
+	kfree(crpc);
+	return ERR_PTR(err);
+}
+
+/**
+ * homa_rpc_alloc_server() - Allocate and initialize a server RPC (one that is
+ * used to manage an incoming request). If appropriate, the RPC will also
+ * be handed off (we do it here, while we have the socket locked, to avoid
+ * acquiring the socket lock a second time later for the handoff).
+ * @hsk:      Socket that owns this RPC.
+ * @source:   IP address (network byte order) of the RPC's client.
+ * @h:        Header for the first data packet received for this RPC; used
+ *            to initialize the RPC.
+ * @created: Will be set to 1 if a new RPC was created and 0 if an + * existing RPC was found. + * + * Return: A pointer to a new RPC, which is locked, or a negative errno + * if an error occurred. If there is already an RPC corresponding + * to h, then it is returned instead of creating a new RPC. + */ +struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, + const struct in6_addr *source, + struct homa_data_hdr *h, int *created) + __cond_acquires(srpc->bucket->lock) +{ + u64 id = homa_local_id(h->common.sender_id); + struct homa_rpc_bucket *bucket; + struct homa_rpc *srpc = NULL; + int err; + + if (!hsk->buffer_pool) + return ERR_PTR(-ENOMEM); + + /* Lock the bucket, and make sure no-one else has already created + * the desired RPC. + */ + bucket = homa_server_rpc_bucket(hsk, id); + homa_bucket_lock(bucket, id); + hlist_for_each_entry(srpc, &bucket->rpcs, hash_links) { + if (srpc->id == id && + srpc->dport == ntohs(h->common.sport) && + ipv6_addr_equal(&srpc->peer->addr, source)) { + /* RPC already exists; just return it instead + * of creating a new RPC. + */ + *created = 0; + return srpc; + } + } + + /* Initialize fields that don't require the socket lock. */ + srpc = kzalloc(sizeof(*srpc), GFP_ATOMIC); + if (!srpc) { + err = -ENOMEM; + goto error; + } + srpc->hsk = hsk; + srpc->bucket = bucket; + srpc->state = RPC_INCOMING; + refcount_set(&srpc->refs, 1); + srpc->peer = homa_peer_get(hsk, source); + if (IS_ERR(srpc->peer)) { + err = PTR_ERR(srpc->peer); + srpc->peer = NULL; + goto error; + } + srpc->dport = ntohs(h->common.sport); + srpc->id = id; + srpc->msgin.length = -1; + srpc->msgout.length = -1; + IF_NO_STRIP(homa_qdisc_rpc_init(&srpc->qrpc)); + INIT_LIST_HEAD(&srpc->ready_links); + INIT_LIST_HEAD(&srpc->buf_links); + INIT_LIST_HEAD(&srpc->dead_links); +#ifndef __STRIP__ /* See strip.py */ + INIT_LIST_HEAD(&srpc->grantable_links); +#endif /* See strip.py */ + INIT_LIST_HEAD(&srpc->throttled_links); + srpc->resend_timer_ticks = hsk->homa->timer_ticks; + srpc->magic = HOMA_RPC_MAGIC; + srpc->start_time = homa_clock(); +#ifndef __STRIP__ /* See strip.py */ + tt_record2("Incoming message for id %d has %d unscheduled bytes", + srpc->id, ntohl(h->incoming)); +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ + err = homa_message_in_init(srpc, ntohl(h->message_length), + ntohl(h->incoming)); +#else /* See strip.py */ + err = homa_message_in_init(srpc, ntohl(h->message_length)); +#endif /* See strip.py */ + if (err != 0) + goto error; + + /* Initialize fields that require socket to be locked. */ + homa_sock_lock(hsk); + if (hsk->shutdown) { + homa_sock_unlock(hsk); + err = -ESHUTDOWN; + goto error; + } + hlist_add_head(&srpc->hash_links, &bucket->rpcs); + list_add_tail_rcu(&srpc->active_links, &hsk->active_rpcs); + homa_sock_unlock(hsk); + if (ntohl(h->seg.offset) == 0 && srpc->msgin.num_bpages > 0) { + set_bit(RPC_PKTS_READY, &srpc->flags); + homa_rpc_handoff(srpc); + } + INC_METRIC(requests_received, 1); + *created = 1; + return srpc; + +error: + homa_bucket_unlock(bucket, id); + if (srpc && srpc->peer) + homa_peer_release(srpc->peer); + kfree(srpc); + return ERR_PTR(err); +} + +/** + * homa_rpc_acked() - This function is invoked when an ack is received + * for an RPC; if the RPC still exists, is freed. + * @hsk: Socket on which the ack was received. May or may not correspond + * to the RPC, but can sometimes be used to avoid a socket lookup. 
+ * @saddr:    Source address from which the ack was received (the client
+ *            node for the RPC).
+ * @ack:      Information about an RPC from @saddr that may now be deleted
+ *            safely.
+ */
+void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
+		    struct homa_ack *ack)
+{
+	u16 server_port = ntohs(ack->server_port);
+	u64 id = homa_local_id(ack->client_id);
+	struct homa_sock *hsk2 = hsk;
+	struct homa_rpc *rpc;
+
+	UNIT_LOG("; ", "ack %llu", id);
+	if (hsk->port != server_port) {
+		/* Without RCU, sockets other than hsk can be deleted
+		 * out from under us.
+		 */
+		hsk2 = homa_sock_find(hsk->hnet, server_port);
+		if (!hsk2)
+			return;
+	}
+	rpc = homa_rpc_find_server(hsk2, saddr, id);
+	if (rpc) {
+		tt_record1("homa_rpc_acked freeing id %d", rpc->id);
+		homa_rpc_end(rpc);
+		homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */
+	}
+	if (hsk->port != server_port)
+		sock_put(&hsk2->sock);
+}
+
+/**
+ * homa_rpc_end() - Stop all activity on an RPC and begin the process of
+ * releasing its resources; this process will continue in the background
+ * until homa_rpc_reap eventually completes it.
+ * @rpc:      Structure to clean up, or NULL. Must be locked. Its socket must
+ *            not be locked. Once this function returns the caller should not
+ *            use the RPC except to unlock it.
+ */
+void homa_rpc_end(struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	/* The goal for this function is to make the RPC inaccessible,
+	 * so that no other code will ever access it again. However, don't
+	 * actually release resources or tear down the internal structure
+	 * of the RPC; leave that to homa_rpc_reap, which runs later. There
+	 * are two reasons for this. First, releasing resources may be
+	 * expensive, so we don't want to keep the caller waiting; homa_rpc_reap
+	 * will run in situations where there is time to spare. Second, there
+	 * may be other code that currently has pointers to this RPC but
+	 * temporarily released the lock (e.g. to copy data to/from user space).
+	 * It isn't safe to clean up until that code has finished its work and
+	 * released any pointers to the RPC (homa_rpc_reap will ensure that
+	 * this has happened). So, this function should only make changes
+	 * needed to make the RPC inaccessible.
+	 */
+	if (!rpc || rpc->state == RPC_DEAD)
+		return;
+	UNIT_LOG("; ", "homa_rpc_end invoked");
+	tt_record2("homa_rpc_end invoked for id %d, port %d", rpc->id,
+		   rpc->hsk->port);
+	rpc->state = RPC_DEAD;
+	rpc->error = -EINVAL;
+
+#ifndef __STRIP__ /* See strip.py */
+	/* The following line must occur before the socket is locked. This is
+	 * necessary because homa_grant_end_rpc releases the RPC lock and
+	 * reacquires it.
+	 */
+	if (rpc->msgin.length >= 0)
+		homa_grant_end_rpc(rpc);
+#endif /* See strip.py */
+
+	/* Unlink from all lists, so no-one will ever find this RPC again.
 */
+ homa_sock_lock(rpc->hsk);
+ __hlist_del(&rpc->hash_links);
+ list_del_rcu(&rpc->active_links);
+ list_add_tail(&rpc->dead_links, &rpc->hsk->dead_rpcs);
+ __list_del_entry(&rpc->ready_links);
+ __list_del_entry(&rpc->buf_links);
+ homa_interest_notify_private(rpc);
+
+ if (rpc->msgin.length >= 0) {
+ rpc->hsk->dead_skbs += skb_queue_len(&rpc->msgin.packets);
+ while (1) {
+ struct homa_gap *gap;
+
+ gap = list_first_entry_or_null(&rpc->msgin.gaps,
+ struct homa_gap, links);
+ if (!gap)
+ break;
+ list_del(&gap->links);
+ kfree(gap);
+ }
+ }
+ rpc->hsk->dead_skbs += rpc->msgout.num_skbs;
+ if (rpc->hsk->dead_skbs > rpc->hsk->homa->max_dead_buffs)
+ /* This update isn't thread-safe; it's just a
+ * statistic so it's OK if updates occasionally get
+ * missed.
+ */
+ rpc->hsk->homa->max_dead_buffs = rpc->hsk->dead_skbs;
+
+ homa_sock_unlock(rpc->hsk);
+ IF_NO_STRIP(homa_pacer_unmanage_rpc(rpc));
+}
+
+/**
+ * homa_rpc_abort() - Terminate an RPC.
+ * @rpc: RPC to be terminated. Must be locked by caller.
+ * @error: A negative errno value indicating the error that caused the abort.
+ * If this is a client RPC, the error will be returned to the
+ * application; if it's a server RPC, the error is ignored and
+ * we just free the RPC.
+ */
+void homa_rpc_abort(struct homa_rpc *rpc, int error)
+ __must_hold(rpc->bucket->lock)
+{
+ if (!homa_is_client(rpc->id)) {
+ INC_METRIC(server_rpc_discards, 1);
+ tt_record3("aborting server RPC: peer 0x%x, id %d, error %d",
+ tt_addr(rpc->peer->addr), rpc->id, error);
+ homa_rpc_end(rpc);
+ return;
+ }
+ tt_record3("aborting client RPC: peer 0x%x, id %d, error %d",
+ tt_addr(rpc->peer->addr), rpc->id, error);
+ rpc->error = error;
+ homa_rpc_handoff(rpc);
+}
+
+/**
+ * homa_abort_rpcs() - Abort all RPCs to/from a particular peer.
+ * @homa: Overall data about the Homa protocol implementation.
+ * @addr: Address (network order) of the destination whose RPCs are
+ * to be aborted.
+ * @port: If nonzero, then RPCs will only be aborted if they were
+ * targeted at this server port.
+ * @error: Negative errno value indicating the reason for the abort.
+ */
+void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr,
+ int port, int error)
+{
+ struct homa_socktab_scan scan;
+ struct homa_sock *hsk;
+ struct homa_rpc *rpc;
+
+ for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk;
+ hsk = homa_socktab_next(&scan)) {
+ /* Skip the (expensive) lock acquisition if there's no
+ * work to do.
+ */
+ if (list_empty(&hsk->active_rpcs))
+ continue;
+ if (!homa_protect_rpcs(hsk))
+ continue;
+ rcu_read_lock();
+ list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
+ if (!ipv6_addr_equal(&rpc->peer->addr, addr))
+ continue;
+ if (port && rpc->dport != port)
+ continue;
+ homa_rpc_lock(rpc);
+ homa_rpc_abort(rpc, error);
+ homa_rpc_unlock(rpc);
+ }
+ rcu_read_unlock();
+ homa_unprotect_rpcs(hsk);
+ }
+ homa_socktab_end_scan(&scan);
+}
+
+/**
+ * homa_rpc_reap() - Invoked to release resources associated with dead
+ * RPCs for a given socket.
+ * @hsk: Homa socket that may contain dead RPCs. Must not be locked by the
+ * caller; this function will lock and release.
+ * @reap_all: False means do a small chunk of work; there may still be
+ * unreaped RPCs on return. True means reap all dead RPCs for
+ * hsk. Will busy-wait if reaping has been disabled for some RPCs.
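+ *
+ * Illustrative usage (hypothetical call sites): a thread that is waiting
+ * for incoming messages can call homa_rpc_reap(hsk, false) to do a bounded
+ * chunk of cleanup, while code that must release everything can call
+ * homa_rpc_reap(hsk, true).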
+ *
+ * Return: 0 means that we ran out of work to do; calling
+ * again will do no work (there could be unreaped RPCs, but if so,
+ * they cannot currently be reaped). A value greater than zero means
+ * there is still more reaping work to be done.
+ */
+int homa_rpc_reap(struct homa_sock *hsk, bool reap_all)
+{
+ /* RPC Reaping Strategy:
+ *
+ * (Note: there are references to this comment elsewhere in the
+ * Homa code)
+ *
+ * Most of the cost of reaping comes from freeing sk_buffs; this can be
+ * quite expensive for RPCs with long messages.
+ *
+ * The natural time to reap is when homa_rpc_end is invoked to
+ * terminate an RPC, but this doesn't work for two reasons. First,
+ * there may be outstanding references to the RPC; it cannot be reaped
+ * until all of those references have been released. Second, reaping
+ * is potentially expensive and RPC termination could occur in
+ * homa_softirq when there are short messages waiting to be processed.
+ * Taking time to reap a long RPC could result in significant delays
+ * for subsequent short RPCs.
+ *
+ * Thus Homa doesn't reap immediately in homa_rpc_end. Instead, dead
+ * RPCs are queued up and reaping occurs in this function, which is
+ * invoked later when it is less likely to impact latency. The
+ * challenge is to do this so that (a) we don't allow large numbers of
+ * dead RPCs to accumulate and (b) we minimize the impact of reaping
+ * on latency.
+ *
+ * The primary place where homa_rpc_reap is invoked is when threads
+ * are waiting for incoming messages. The thread has nothing else to
+ * do (it may even be polling for input), so reaping can be performed
+ * with no latency impact on the application. However, if a machine
+ * is overloaded then it may never wait, so this mechanism isn't always
+ * sufficient.
+ *
+ * Homa now reaps in two other places, if reaping while waiting for
+ * messages isn't adequate:
+ * 1. If too many dead skbs accumulate, then homa_timer will call
+ * homa_rpc_reap.
+ * 2. If the timer thread cannot keep up with all the reaping to be
+ * done then as a last resort homa_dispatch_pkts will reap in small
+ * increments (a few sk_buffs or RPCs) for every incoming batch
+ * of packets. This is undesirable because it will impact Homa's
+ * performance.
+ *
+ * During the introduction of homa_pools for managing input
+ * buffers, freeing of packets for incoming messages was moved to
+ * homa_copy_to_user under the assumption that this code wouldn't be
+ * on the critical path. However, there is evidence that with
+ * fast networks (e.g. 100 Gbps) copying to user space is the
+ * bottleneck for incoming messages, and packet freeing takes about
+ * 20-25% of the total time in homa_copy_to_user. So, it may eventually
+ * be desirable to move packet freeing out of homa_copy_to_user.
+ */
+#ifdef __UNIT_TEST__
+#define BATCH_MAX 3
+#else /* __UNIT_TEST__ */
+#define BATCH_MAX 10
+#endif /* __UNIT_TEST__ */
+ struct homa_rpc *rpcs[BATCH_MAX];
+ struct sk_buff *skbs[BATCH_MAX];
+ int num_skbs, num_rpcs;
+ bool checked_all_rpcs;
+ struct homa_rpc *rpc;
+ struct homa_rpc *tmp;
+ int i, batch_size;
+ int skbs_to_reap;
+ int rx_frees;
+
+ INC_METRIC(reaper_calls, 1);
+ INC_METRIC(reaper_dead_skbs, hsk->dead_skbs);
+
+ /* Each iteration through the following loop will reap up to
+ * BATCH_MAX skbs.
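+ * The total work per call is bounded by hsk->homa->reap_limit skbs
+ * unless reap_all is set, in which case the loop continues until
+ * every RPC on hsk->dead_rpcs has been checked.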
+ */ + skbs_to_reap = hsk->homa->reap_limit; + checked_all_rpcs = list_empty(&hsk->dead_rpcs); + while (!checked_all_rpcs) { + batch_size = BATCH_MAX; + if (!reap_all) { + if (skbs_to_reap <= 0) + break; + if (batch_size > skbs_to_reap) + batch_size = skbs_to_reap; + skbs_to_reap -= batch_size; + } + num_skbs = 0; + num_rpcs = 0; + rx_frees = 0; + + homa_sock_lock(hsk); + if (atomic_read(&hsk->protect_count)) { + INC_METRIC(disabled_reaps, 1); + tt_record3("homa_rpc_reap returning for port %d: protect_count %d, dead_skbs %d", + hsk->port, atomic_read(&hsk->protect_count), + hsk->dead_skbs); + homa_sock_unlock(hsk); + return 0; + } + + /* Collect buffers and freeable RPCs. */ + list_for_each_entry_safe(rpc, tmp, &hsk->dead_rpcs, + dead_links) { + int refs; + + /* Make sure that all outstanding uses of the RPC have + * completed. We can read the reference count safely + * only when we're holding the lock. Note: it isn't + * safe to block while locking the RPC here, since we + * hold the socket lock. + */ + if (homa_rpc_try_lock(rpc)) { + refs = refcount_read(&rpc->refs); + homa_rpc_unlock(rpc); + } else { + refs = 2; + } + if (refs > 1) { + INC_METRIC(deferred_rpc_reaps, 1); + continue; + } + + /* For Tx sk_buffs, collect them here but defer + * freeing until after releasing the socket lock. + */ + if (rpc->msgout.length >= 0) { + while (1) { + struct sk_buff *skb; + + skb = rpc->msgout.to_free; + if (!skb) { + skb = rpc->msgout.packets; + if (!skb) + break; + rpc->msgout.to_free = skb; + rpc->msgout.packets = NULL; + } + +#ifndef __STRIP__ /* See strip.py */ + /* This tests whether skb is still in a + * transmit queue somewhere; if so, + * can't reap the RPC since homa_qdisc + * may try to access the RPC via the + * skb's homa_skb_info. + */ +#else /* See strip.py */ + /* Don't reap RPC if anyone besides + * us has a reference to the skb. + */ +#endif /* See strip.py */ + if (refcount_read(&skb->users) > 1) { + INC_METRIC(reaper_active_skbs, + 1); + goto next_rpc; + } + skbs[num_skbs] = skb; + rpc->msgout.to_free = + homa_get_skb_info(skb)->next_skb; + num_skbs++; + rpc->msgout.num_skbs--; + if (num_skbs >= batch_size) + goto release; + } + } + + /* In the normal case rx sk_buffs will already have been + * freed before we got here. Thus it's OK to free + * immediately in rare situations where there are + * buffers left. + */ + if (rpc->msgin.length >= 0 && + !skb_queue_empty_lockless(&rpc->msgin.packets)) { + rx_frees += skb_queue_len(&rpc->msgin.packets); + __skb_queue_purge_reason(&rpc->msgin.packets, + SKB_CONSUMED); + } + + /* If we get here, it means all packets have been + * removed from the RPC. + */ + rpcs[num_rpcs] = rpc; + num_rpcs++; + list_del(&rpc->dead_links); + WARN_ON(refcount_sub_and_test(rpc->msgout.skb_memory, + &hsk->sock.sk_wmem_alloc)); + if (num_rpcs >= batch_size) + goto release; + +next_rpc: + continue; + } + checked_all_rpcs = true; + + /* Free all of the collected resources; release the socket + * lock while doing this. 
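+ * (Freeing skbs and buffer pages is expensive, and none of it
+ * requires the socket lock.)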
+ */ +release: + hsk->dead_skbs -= num_skbs + rx_frees; + homa_sock_unlock(hsk); + homa_skb_free_many_tx(hsk->homa, skbs, num_skbs); + for (i = 0; i < num_rpcs; i++) { + IF_NO_STRIP(int tx_left); + + rpc = rpcs[i]; + + UNIT_LOG("; ", "reaped %llu", rpc->id); + if (unlikely(rpc->msgin.num_bpages)) + homa_pool_release_buffers(rpc->hsk->buffer_pool, + rpc->msgin.num_bpages, + rpc->msgin.bpage_offsets); + if (rpc->msgin.length >= 0) { + while (1) { + struct homa_gap *gap; + + gap = list_first_entry_or_null( + &rpc->msgin.gaps, + struct homa_gap, + links); + if (!gap) + break; + list_del(&gap->links); + kfree(gap); + } + } + if (rpc->peer) { + homa_peer_release(rpc->peer); + rpc->peer = NULL; + } + tt_record2("homa_rpc_reap finished reaping id %d, port %d", + rpc->id, rpc->hsk->port); +#ifndef __STRIP__ /* See strip.py */ + + tx_left = rpc->msgout.length - + rpc->msgout.next_xmit_offset; + if (homa_is_client(rpc->id)) { + INC_METRIC(client_response_bytes_done, + rpc->msgin.bytes_remaining); + INC_METRIC(client_responses_done, + rpc->msgin.bytes_remaining != 0); + if (tx_left > 0) { + INC_METRIC(client_request_bytes_done, + tx_left); + INC_METRIC(client_requests_done, 1); + } + } else { + INC_METRIC(server_request_bytes_done, + rpc->msgin.bytes_remaining); + INC_METRIC(server_requests_done, + rpc->msgin.bytes_remaining != 0); + if (tx_left > 0) { + INC_METRIC(server_response_bytes_done, + tx_left); + INC_METRIC(server_responses_done, 1); + } + } +#endif /* See strip.py */ + rpc->state = 0; + rpc->magic = 0; + kfree(rpc); + } + homa_sock_wakeup_wmem(hsk); + tt_record4("reaped %d skbs, %d rpcs; %d skbs remain for port %d", + num_skbs + rx_frees, num_rpcs, hsk->dead_skbs, + hsk->port); + } + homa_pool_check_waiting(hsk->buffer_pool); + return !checked_all_rpcs; +} + +/** + * homa_abort_sock_rpcs() - Abort all outgoing (client-side) RPCs on a given + * socket. + * @hsk: Socket whose RPCs should be aborted. + * @error: Zero means that the aborted RPCs should be freed immediately. + * A nonzero value means that the RPCs should be marked + * complete, so that they can be returned to the application; + * this value (a negative errno) will be returned from + * recvmsg. + */ +void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) +{ + struct homa_rpc *rpc; + + if (list_empty(&hsk->active_rpcs)) + return; + if (!homa_protect_rpcs(hsk)) + return; + rcu_read_lock(); + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + if (!homa_is_client(rpc->id)) + continue; + homa_rpc_lock(rpc); + if (rpc->state == RPC_DEAD) { + homa_rpc_unlock(rpc); + continue; + } + tt_record4("homa_abort_sock_rpcs aborting id %u on port %d, peer 0x%x, error %d", + rpc->id, hsk->port, + tt_addr(rpc->peer->addr), error); + if (error) + homa_rpc_abort(rpc, error); + else + homa_rpc_end(rpc); + homa_rpc_unlock(rpc); + } + rcu_read_unlock(); + homa_unprotect_rpcs(hsk); +} + +/** + * homa_rpc_find_client() - Locate client-side information about the RPC that + * a packet belongs to, if there is any. Thread-safe without socket lock. + * @hsk: Socket via which packet was received. + * @id: Unique identifier for the RPC. + * + * Return: A pointer to the homa_rpc for this id, or NULL if none. + * The RPC will be locked; the caller must eventually unlock it + * by invoking homa_rpc_unlock. 
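+ *
+ * Typical caller pattern (illustrative only):
+ *
+ *	rpc = homa_rpc_find_client(hsk, id);
+ *	if (rpc) {
+ *		...process the packet, then...
+ *		homa_rpc_unlock(rpc);
+ *	}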
+ */ +struct homa_rpc *homa_rpc_find_client(struct homa_sock *hsk, u64 id) + __cond_acquires(crpc->bucket->lock) +{ + struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); + struct homa_rpc *crpc; + + homa_bucket_lock(bucket, id); + hlist_for_each_entry(crpc, &bucket->rpcs, hash_links) { + if (crpc->id == id) + return crpc; + } + homa_bucket_unlock(bucket, id); + return NULL; +} + +/** + * homa_rpc_find_server() - Locate server-side information about the RPC that + * a packet belongs to, if there is any. Thread-safe without socket lock. + * @hsk: Socket via which packet was received. + * @saddr: Address from which the packet was sent. + * @id: Unique identifier for the RPC (must have server bit set). + * + * Return: A pointer to the homa_rpc matching the arguments, or NULL + * if none. The RPC will be locked; the caller must eventually + * unlock it by invoking homa_rpc_unlock. + */ +struct homa_rpc *homa_rpc_find_server(struct homa_sock *hsk, + const struct in6_addr *saddr, u64 id) + __cond_acquires(srpc->bucket->lock) +{ + struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); + struct homa_rpc *srpc; + + homa_bucket_lock(bucket, id); + hlist_for_each_entry(srpc, &bucket->rpcs, hash_links) { + if (srpc->id == id && ipv6_addr_equal(&srpc->peer->addr, saddr)) + return srpc; + } + homa_bucket_unlock(bucket, id); + return NULL; +} + +/** + * homa_rpc_get_info() - Extract information from an RPC for returning to + * an application via the HOMAIOCINFO ioctl. + * @rpc: RPC for which information is desired. + * @info: Structure in which to store the information. + */ +void homa_rpc_get_info(struct homa_rpc *rpc, struct homa_rpc_info *info) +{ + struct homa_gap *gap; + + memset(info, 0, sizeof(*info)); + info->id = rpc->id; + if (rpc->hsk->inet.sk.sk_family == AF_INET6) { + info->peer.in6.sin6_family = AF_INET6; + info->peer.in6.sin6_addr = rpc->peer->addr; + info->peer.in6.sin6_port = htons(rpc->dport); + } else { + info->peer.in6.sin6_family = AF_INET; + info->peer.in4.sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr); + info->peer.in4.sin_port = htons(rpc->dport); + } + info->completion_cookie = rpc->completion_cookie; + if (rpc->msgout.length >= 0) { + info->tx_length = rpc->msgout.length; + info->tx_sent = rpc->msgout.next_xmit_offset; +#ifndef __STRIP__ /* See strip.py */ + info->tx_granted = rpc->msgout.granted; + info->tx_prio = rpc->msgout.sched_priority; +#else /* See strip.py */ + info->tx_granted = rpc->msgout.length; +#endif /* See strip.py */ + } else { + info->tx_length = -1; + } + if (rpc->msgin.length >= 0) { + info->rx_length = rpc->msgin.length; + info->rx_remaining = rpc->msgin.bytes_remaining; + list_for_each_entry(gap, &rpc->msgin.gaps, links) { + info->rx_gaps++; + info->rx_gap_bytes += gap->end - gap->start; + } +#ifndef __STRIP__ /* See strip.py */ + info->rx_granted = rpc->msgin.granted; +#else /* See strip.py */ + info->rx_granted = rpc->msgin.length; +#endif /* See strip.py */ + if (skb_queue_len(&rpc->msgin.packets) > 0) + info->flags |= HOMA_RPC_RX_COPY; + } else { + info->rx_length = -1; + } + if (!list_empty(&rpc->buf_links)) + info->flags |= HOMA_RPC_BUF_STALL; + if (!list_empty(&rpc->ready_links) && + rpc->msgin.bytes_remaining == 0 && + skb_queue_len(&rpc->msgin.packets) == 0) + info->flags |= HOMA_RPC_RX_READY; + if (rpc->flags & RPC_PRIVATE) + info->flags |= HOMA_RPC_PRIVATE; +} diff --git a/homa_rpc.h b/homa_rpc.h new file mode 100644 index 00000000..16750746 --- /dev/null +++ b/homa_rpc.h @@ -0,0 +1,634 @@ +/* SPDX-License-Identifier: 
BSD-2-Clause or GPL-2.0+ */
+
+/* This file defines homa_rpc and related structs. */
+
+#ifndef _HOMA_RPC_H
+#define _HOMA_RPC_H
+
+#include <linux/percpu-defs.h>
+#include <linux/skbuff.h>
+#include <uapi/linux/homa.h>
+
+#include "homa_sock.h"
+#include "homa_wire.h"
+
+/* Forward references. */
+struct homa_ack;
+
+/**
+ * struct homa_message_out - Describes a message (either request or response)
+ * for which this machine is the sender.
+ */
+struct homa_message_out {
+ /**
+ * @length: Total bytes in message (excluding headers). A value
+ * less than 0 means this structure is uninitialized and therefore
+ * not in use (all other fields will be zero in this case).
+ */
+ int length;
+
+ /** @num_skbs: Total number of buffers currently in @to_free. */
+ int num_skbs;
+
+ /**
+ * @skb_memory: Total number of bytes of memory occupied by
+ * the sk_buffs for this message.
+ */
+ int skb_memory;
+
+ /**
+ * @copied_from_user: Number of bytes of the message that have
+ * been copied from user space into skbs in @packets.
+ */
+ int copied_from_user;
+
+ /**
+ * @packets: Singly-linked list of all packets in message, linked
+ * using homa_skb_info->next_skb. The list is in order of offset in
+ * the message (offset 0 first); each sk_buff can potentially contain
+ * multiple data_segments, which will be split into separate packets
+ * by GSO. This list grows gradually as data is copied in from user
+ * space, so it may not be complete.
+ */
+ struct sk_buff *packets;
+
+ /**
+ * @next_xmit: Pointer to pointer to next packet to transmit (will
+ * either refer to @packets or homa_skb_info->next_skb for some skb
+ * in @packets).
+ */
+ struct sk_buff **next_xmit;
+
+ /**
+ * @next_xmit_offset: All bytes in the message, up to but not
+ * including this one, have been passed to ip_queue_xmit or
+ * ip6_xmit.
+ */
+ int next_xmit_offset;
+
+ /**
+ * @first_not_tx: All packets in @packets preceding this one have
+ * been confirmed to have been transmitted by the NIC (the driver
+ * has released its reference). NULL means all packets are known to
+ * have been transmitted. Used by homa_rpc_tx_end.
+ */
+ struct sk_buff *first_not_tx;
+
+ /**
+ * @to_free: Singly-linked list of packets that must be freed by
+ * homa_rpc_reap. Initially holds retransmitted packets, but
+ * eventually includes the packets in @packets. homa_rpc_reap uses
+ * this list to ensure that all tx packets have been freed by the
+ * IP stack before it frees the homa_rpc (otherwise homa_qdisc might
+ * try to access the RPC via a packet's homa_skb_info). Note: I
+ * considered using skb->destructor to release a reference on the RPC,
+ * but this does not appear to be reliable because (a) skb->destructor
+ * may be overwritten and (b) it may be called before the skb has
+ * cleared the tx pipeline (via skb_orphan?). Also, need to retain
+ * @packets in case they are needed for retransmission.
+ */
+ struct sk_buff *to_free;
+
+#ifndef __STRIP__ /* See strip.py */
+ /**
+ * @unscheduled: Initial bytes of message that we'll send
+ * without waiting for grants.
+ */
+ int unscheduled;
+
+ /**
+ * @granted: Total number of bytes we are currently permitted to
+ * send, including unscheduled bytes; must wait for grants before
+ * sending bytes at or beyond this position. Never larger than
+ * @length.
+ */
+ int granted;
+
+ /**
+ * @sched_priority: Priority level to use for future scheduled
+ * packets.
+ */
+ u8 sched_priority;
+#endif /* See strip.py */
+
+ /**
+ * @init_time: homa_clock() time when this structure was initialized.
+ * Used to find the oldest outgoing message.
+ */ + u64 init_time; +}; + +/** + * struct homa_gap - Represents a range of bytes within a message that have + * not yet been received. + */ +struct homa_gap { + /** @start: offset of first byte in this gap. */ + int start; + + /** @end: offset of byte just after last one in this gap. */ + int end; + + /** + * @time: homa_clock() time when the gap was first detected. + * As of 7/2024 this isn't used for anything. + */ + u64 time; + + /** @links: for linking into list in homa_message_in. */ + struct list_head links; +}; + +/** + * struct homa_message_in - Holds the state of a message received by + * this machine; used for both requests and responses. + */ +struct homa_message_in { + /** + * @length: Payload size in bytes. -1 means this structure is + * uninitialized and therefore not in use. + */ + int length; + + /** + * @packets: DATA packets for this message that have been received but + * not yet copied to user space (ordered by increasing offset). The + * lock in this structure is not used (the RPC lock is used instead). + */ + struct sk_buff_head packets; + + /** + * @recv_end: Offset of the byte just after the highest one that + * has been received so far. + */ + int recv_end; + + /** + * @gaps: List of homa_gaps describing all of the bytes with + * offsets less than @recv_end that have not yet been received. + */ + struct list_head gaps; + + /** + * @bytes_remaining: Amount of data for this message that has + * not yet been received; will determine the message's priority. + */ + int bytes_remaining; + + /** + * @num_bpages: The number of entries in @bpage_offsets used for this + * message (0 means buffers not allocated yet). + */ + u32 num_bpages; + + /** + * @bpage_offsets: Describes buffer space allocated for this message. + * Each entry is an offset from the start of the buffer region. + * All but the last pointer refer to areas of size HOMA_BPAGE_SIZE. + */ + u32 bpage_offsets[HOMA_MAX_BPAGES]; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @rank: Position of this RPC in homa->grant->active_rpcs, or -1 + * if not in homa->grant->active_rpcs. Managed by homa_grant.c; + * unsafe to access unless holding homa->grant->lock. + */ + int rank; + + /** + * @granted: Total # of bytes (starting from offset 0) that the sender + * will transmit without additional grants, including unscheduled bytes. + * Never larger than @length. Managed by homa_grant.c. + */ + int granted; + + /** + * @prev_grant: Offset in the last GRANT packet sent for this RPC + * (initially set to unscheduled bytes). + */ + int prev_grant; + + /** + * @rec_incoming: Number of bytes in homa->total_incoming currently + * contributed ("recorded") from this RPC. Managed by homa_grant.c. + */ + int rec_incoming; + + /** + * @birth: homa_clock() time when this structure was initialized + * (i.e. first data packet was received for message). + */ + u64 birth; +#endif /* See strip.py */ +}; + +#ifndef __STRIP__ /* See strip.py */ +/** + * struct homa_rpc_qdisc - Information that homa_qdisc needs to store in + * each RPC. Managed entirely by homa_qdisc. + */ +struct homa_rpc_qdisc { + /** + * @packets: List of tx skbs from this RPC that have been deferred + * by homa_qdisc. Non-empty means this RPC is currently linked into + * homa_qdisc_dev->deferred_rpcs. + */ + struct sk_buff_head packets; + + /** + * @rb_node: Used to link this struct into + * homa_qdisc_dev->deferred_rpcs. 
+ */ + struct rb_node rb_node; + + /** + * @tx_left: The number of (trailing) bytes of the tx message + * that have not been transmitted by homa_qdisc yet. Only updated + * when packets are added to or removed from the deferred list; + * may be out of date (too high) if packets have been transmitted + * without being deferred. + */ + int tx_left; +}; +#endif /* See strip.py */ + +/** + * struct homa_rpc - One of these structures exists for each active + * RPC. The same structure is used to manage both outgoing RPCs on + * clients and incoming RPCs on servers. + */ +struct homa_rpc { + /** @hsk: Socket that owns the RPC. */ + struct homa_sock *hsk; + + /** + * @bucket: Pointer to the bucket in hsk->client_rpc_buckets or + * hsk->server_rpc_buckets where this RPC is linked. Used primarily + * for locking the RPC (which is done by locking its bucket). + */ + struct homa_rpc_bucket *bucket; + + /** + * @state: The current state of this RPC: + * + * @RPC_OUTGOING: The RPC is waiting for @msgout to be transmitted + * to the peer. + * @RPC_INCOMING: The RPC is waiting for data @msgin to be received + * from the peer; at least one packet has already + * been received. + * @RPC_IN_SERVICE: Used only for server RPCs: the request message + * has been read from the socket, but the response + * message has not yet been presented to the kernel. + * @RPC_DEAD: RPC has been deleted and is waiting to be + * reaped. In some cases, information in the RPC + * structure may be accessed in this state. + * + * Client RPCs pass through states in the following order: + * RPC_OUTGOING, RPC_INCOMING, RPC_DEAD. + * + * Server RPCs pass through states in the following order: + * RPC_INCOMING, RPC_IN_SERVICE, RPC_OUTGOING, RPC_DEAD. + */ + enum { + RPC_OUTGOING = 5, + RPC_INCOMING = 6, + RPC_IN_SERVICE = 8, + RPC_DEAD = 9 + } state; + + /** + * @flags: Additional state information: an OR'ed combination of + * various single-bit flags. See below for definitions. Must be + * manipulated with atomic operations because some of the manipulations + * occur without holding the RPC lock. + */ + unsigned long flags; + + /* Valid bit numbers for @flags: + * RPC_PKTS_READY - The RPC has input packets ready to be + * copied to user space. + * APP_NEEDS_LOCK - Means that code in the application thread + * needs the RPC lock (e.g. so it can start + * copying data to user space) so others + * (e.g. SoftIRQ processing) should relinquish + * the lock ASAP. Without this, SoftIRQ can + * lock out the application for a long time, + * preventing data copies to user space from + * starting (and they limit throughput at + * high network speeds). + * RPC_PRIVATE - This RPC will be waited on in "private" mode, + * where the app explicitly requests the + * response from this particular RPC. + */ +#define RPC_PKTS_READY 0 +#define APP_NEEDS_LOCK 1 +#define RPC_PRIVATE 2 + + /** + * @refs: Number of references to this RPC, including one for each + * unmatched call to homa_rpc_hold plus one for the socket's reference + * in either active_rpcs or dead_rpcs. + */ + refcount_t refs; + + /** + * @peer: Information about the other machine (the server, if + * this is a client RPC, or the client, if this is a server RPC). + * If non-NULL then we own a reference on the object. + */ + struct homa_peer *peer; + + /** @dport: Port number on @peer that will handle packets. */ + u16 dport; + + /** + * @id: Unique identifier for the RPC among all those issued + * from its port. 
The low-order bit indicates whether we are + * server (1) or client (0) for this RPC. + */ + u64 id; + + /** + * @completion_cookie: Only used on clients. Contains identifying + * information about the RPC provided by the application; returned to + * the application with the RPC's result. + */ + u64 completion_cookie; + + /** + * @error: Only used on clients. If nonzero, then the RPC has + * failed and the value is a negative errno that describes the + * problem. + */ + int error; + + /** + * @msgin: Information about the message we receive for this RPC + * (for server RPCs this is the request, for client RPCs this is the + * response). + */ + struct homa_message_in msgin; + + /** + * @msgout: Information about the message we send for this RPC + * (for client RPCs this is the request, for server RPCs this is the + * response). + */ + struct homa_message_out msgout; + +#ifndef __STRIP__ /* See strip.py */ + /** @qrpc: Information managed by homa_qdisc for this RPC. */ + struct homa_rpc_qdisc qrpc; +#endif /* See strip.py */ + + /** + * @hash_links: Used to link this object into a hash bucket for + * either @hsk->client_rpc_buckets (for a client RPC), or + * @hsk->server_rpc_buckets (for a server RPC). + */ + struct hlist_node hash_links; + + /** + * @ready_links: Used to link this object into @hsk->ready_rpcs. + */ + struct list_head ready_links; + + /** + * @buf_links: Used to link this RPC into @hsk->waiting_for_bufs. + * If the RPC isn't on @hsk->waiting_for_bufs, this is an empty + * list pointing to itself. + */ + struct list_head buf_links; + + /** + * @active_links: For linking this object into @hsk->active_rpcs. + * The next field will be LIST_POISON1 if this RPC hasn't yet been + * linked into @hsk->active_rpcs. Access with RCU. + */ + struct list_head active_links; + + /** @dead_links: For linking this object into @hsk->dead_rpcs. */ + struct list_head dead_links; + + /** + * @private_interest: If there is a thread waiting for this RPC in + * homa_wait_private, then this points to that thread's interest. + */ + struct homa_interest *private_interest; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @grantable_links: Used to link this RPC into peer->grantable_rpcs. + * If this RPC isn't in peer->grantable_rpcs, this is an empty + * list pointing to itself. Must hold homa->grant->lock when + * accessing. + */ + struct list_head grantable_links; +#endif /* See strip.py */ + + /** + * @throttled_links: Used to link this RPC into + * homa->pacer.throttled_rpcs. If this RPC isn't in + * homa->pacer.throttled_rpcs, this is an empty + * list pointing to itself. + */ + struct list_head throttled_links; + + /** + * @silent_ticks: Number of times homa_timer has been invoked + * since the last time a packet indicating progress was received + * for this RPC, so we don't need to send a resend for a while. + */ + int silent_ticks; + + /** + * @resend_timer_ticks: Value of homa->timer_ticks the last time + * we sent a RESEND for this RPC. + */ + u32 resend_timer_ticks; + + /** + * @done_timer_ticks: The value of homa->timer_ticks the first + * time we noticed that this (server) RPC is done (all response + * packets have been transmitted), so we're ready for an ack. + * Zero means we haven't reached that point yet. + */ + u32 done_timer_ticks; + + /** + * @magic: when the RPC is alive, this holds a distinct value that + * is unlikely to occur naturally. The value is cleared when the + * RPC is reaped, so we can detect accidental use of an RPC after + * it has been reaped. 
+ */
+#define HOMA_RPC_MAGIC 0xdeadbeef
+ int magic;
+
+ /**
+ * @start_time: homa_clock() time when this RPC was created. Used
+ * occasionally for testing.
+ */
+ u64 start_time;
+};
+
+void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr,
+ int port, int error);
+void homa_abort_sock_rpcs(struct homa_sock *hsk, int error);
+void homa_rpc_abort(struct homa_rpc *crpc, int error);
+void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
+ struct homa_ack *ack);
+struct homa_rpc
+ *homa_rpc_alloc_client(struct homa_sock *hsk,
+ const union sockaddr_in_union *dest);
+struct homa_rpc
+ *homa_rpc_alloc_server(struct homa_sock *hsk,
+ const struct in6_addr *source,
+ struct homa_data_hdr *h, int *created);
+void homa_rpc_end(struct homa_rpc *rpc);
+struct homa_rpc
+ *homa_rpc_find_client(struct homa_sock *hsk, u64 id);
+struct homa_rpc
+ *homa_rpc_find_server(struct homa_sock *hsk,
+ const struct in6_addr *saddr, u64 id);
+void homa_rpc_get_info(struct homa_rpc *rpc, struct homa_rpc_info *info);
+int homa_rpc_reap(struct homa_sock *hsk, bool reap_all);
+
+/**
+ * homa_rpc_lock() - Acquire the lock for an RPC.
+ * @rpc: RPC to lock.
+ */
+static inline void homa_rpc_lock(struct homa_rpc *rpc)
+ __acquires(rpc->bucket->lock)
+{
+ homa_bucket_lock(rpc->bucket, rpc->id);
+}
+
+/**
+ * homa_rpc_try_lock() - Acquire the lock for an RPC if it is available.
+ * @rpc: RPC to lock.
+ * Return: Nonzero if lock was successfully acquired, zero if it is
+ * currently owned by someone else.
+ */
+static inline int homa_rpc_try_lock(struct homa_rpc *rpc)
+ __cond_acquires(rpc->bucket->lock)
+{
+ if (!spin_trylock_bh(&rpc->bucket->lock))
+ return 0;
+ return 1;
+}
+
+/**
+ * homa_rpc_lock_preempt() - Same as homa_rpc_lock, except sets the
+ * APP_NEEDS_LOCK flag while waiting to encourage the existing lock
+ * owner to relinquish the lock.
+ * @rpc: RPC to lock.
+ */
+static inline void homa_rpc_lock_preempt(struct homa_rpc *rpc)
+ __acquires(rpc->bucket->lock)
+{
+ set_bit(APP_NEEDS_LOCK, &rpc->flags);
+ homa_bucket_lock(rpc->bucket, rpc->id);
+ clear_bit(APP_NEEDS_LOCK, &rpc->flags);
+}
+
+/**
+ * homa_rpc_unlock() - Release the lock for an RPC.
+ * @rpc: RPC to unlock.
+ */
+static inline void homa_rpc_unlock(struct homa_rpc *rpc)
+ __releases(rpc->bucket->lock)
+{
+ homa_bucket_unlock(rpc->bucket, rpc->id);
+}
+
+/**
+ * homa_protect_rpcs() - Ensures that no RPCs will be reaped for a given
+ * socket until homa_unprotect_rpcs is called. Typically used by functions
+ * that want to scan the active RPCs for a socket without holding the socket
+ * lock. Multiple calls to this function may be in effect at once. See
+ * "Homa Locking Strategy" in homa_impl.h for more info on why this function
+ * is needed.
+ * @hsk: Socket whose RPCs should be protected. Must not be locked
+ * by the caller; will be locked here.
+ *
+ * Return: 1 for success, 0 if the socket has been shut down, in which
+ * case its RPCs cannot be protected.
+ */
+static inline int homa_protect_rpcs(struct homa_sock *hsk)
+{
+ int result;
+
+ homa_sock_lock(hsk);
+ result = !hsk->shutdown;
+ if (result)
+ atomic_inc(&hsk->protect_count);
+ homa_sock_unlock(hsk);
+ return result;
+}
+
+/**
+ * homa_unprotect_rpcs() - Cancel the effect of a previous call to
+ * homa_protect_rpcs(), so that RPCs can once again be reaped.
+ * @hsk: Socket whose RPCs should be unprotected.
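+ *
+ * Illustrative scan pattern (mirrors the code in homa_abort_sock_rpcs):
+ *
+ *	if (homa_protect_rpcs(hsk)) {
+ *		rcu_read_lock();
+ *		list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links)
+ *			...;
+ *		rcu_read_unlock();
+ *		homa_unprotect_rpcs(hsk);
+ *	}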
+ */
+static inline void homa_unprotect_rpcs(struct homa_sock *hsk)
+{
+ atomic_dec(&hsk->protect_count);
+}
+
+#ifndef __UNIT_TEST__
+/**
+ * homa_rpc_hold() - Increment the reference count on an RPC, which will
+ * prevent it from being freed until homa_rpc_put() is called. References
+ * are taken in two situations:
+ * 1. An RPC is going to be manipulated by a collection of functions. In
+ * this case the top-most function that identifies the RPC takes the
+ * reference; any function that receives an RPC as an argument can
+ * assume that a reference has been taken on the RPC by some higher
+ * function on the call stack.
+ * 2. A pointer to an RPC is stored in an object for use later, such as
+ * an interest. A reference must be held as long as the pointer remains
+ * accessible in the object.
+ * @rpc: RPC on which to take a reference.
+ */
+static inline void homa_rpc_hold(struct homa_rpc *rpc)
+{
+ refcount_inc(&rpc->refs);
+}
+
+/**
+ * homa_rpc_put() - Release a reference on an RPC (cancels the effect of
+ * a previous call to homa_rpc_hold).
+ * @rpc: RPC to release.
+ */
+static inline void homa_rpc_put(struct homa_rpc *rpc)
+{
+ refcount_dec(&rpc->refs);
+}
+#endif /* __UNIT_TEST__ */
+
+/**
+ * homa_is_client(): returns true if we are the client for a particular RPC,
+ * false if we are the server.
+ * @id: Id of the RPC in question.
+ * Return: true if we are the client for RPC id, false otherwise
+ */
+static inline bool homa_is_client(u64 id)
+{
+ return (id & 1) == 0;
+}
+
+/**
+ * homa_rpc_needs_attention() - Returns true if @rpc has failed or if
+ * its incoming message is ready for attention by an application thread
+ * (e.g., packets are ready to copy to user space).
+ * @rpc: RPC to check.
+ * Return: See above
+ */
+static inline bool homa_rpc_needs_attention(struct homa_rpc *rpc)
+{
+ return (rpc->error != 0 || test_bit(RPC_PKTS_READY, &rpc->flags));
+}
+
+#endif /* _HOMA_RPC_H */
diff --git a/homa_skb.c b/homa_skb.c
new file mode 100644
index 00000000..58f10807
--- /dev/null
+++ b/homa_skb.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+/* This file contains functions for allocating and freeing sk_buffs for
+ * outbound packets. In particular, this file implements efficient management
+ * of the memory used by sk_buffs.
+ */
+
+#include "homa_impl.h"
+#include "homa_skb.h"
+
+DEFINE_PER_CPU(struct homa_skb_core, homa_skb_core);
+
+#ifdef __UNIT_TEST__
+extern int mock_max_skb_frags;
+#define HOMA_MAX_SKB_FRAGS mock_max_skb_frags
+#else
+#define HOMA_MAX_SKB_FRAGS MAX_SKB_FRAGS
+#endif
+
+static void frag_page_set(skb_frag_t *frag, struct page *page)
+{
+ frag->netmem = page_to_netmem(page);
+}
+
+/**
+ * homa_skb_init() - Invoked when a struct homa is created to initialize
+ * information related to sk_buff management.
+ * @homa: Shared information about the Homa transport
+ * Return: 0 for success, negative errno on error
+ */
+int homa_skb_init(struct homa *homa)
+{
+ int i;
+
+ spin_lock_init(&homa->page_pool_mutex);
+ memset(homa->page_pools, 0, sizeof(homa->page_pools));
+ homa->skb_page_frees_per_sec = 1000;
+ homa->skb_pages_to_free = NULL;
+ homa->pages_to_free_slots = 0;
+ homa->skb_page_free_time = 0;
+ homa->skb_page_pool_min_kb = (3 * HOMA_MAX_MESSAGE_LENGTH) / 1000;
+
+ /* Initialize NUMA-specific page pools.
 */
+ homa->max_numa = -1;
+ for (i = 0; i < nr_cpu_ids; i++) {
+ struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, i);
+ int numa = cpu_to_node(i);
+
+ BUG_ON(numa >= MAX_NUMNODES);
+ if (numa > homa->max_numa)
+ homa->max_numa = numa;
+ if (!homa->page_pools[numa]) {
+ struct homa_page_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_ATOMIC);
+ if (!pool)
+ return -ENOMEM;
+ homa->page_pools[numa] = pool;
+ }
+ skb_core->pool = homa->page_pools[numa];
+ }
+ pr_notice("%s found max NUMA node %d\n", __func__, homa->max_numa);
+ return 0;
+}
+
+/**
+ * homa_skb_cleanup() - Invoked when a struct homa is deleted; cleans
+ * up information related to skb allocation.
+ * @homa: Overall information about the Homa transport.
+ */
+void homa_skb_cleanup(struct homa *homa)
+{
+ int i, j;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, i);
+
+ if (skb_core->skb_page) {
+ put_page(skb_core->skb_page);
+ skb_core->skb_page = NULL;
+ skb_core->page_size = 0;
+ skb_core->page_inuse = 0;
+ }
+ for (j = 0; j < skb_core->num_stashed_pages; j++)
+ put_page(skb_core->stashed_pages[j]);
+ skb_core->pool = NULL;
+ skb_core->num_stashed_pages = 0;
+ }
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ struct homa_page_pool *pool = homa->page_pools[i];
+
+ if (!pool)
+ continue;
+ for (j = pool->avail - 1; j >= 0; j--)
+ put_page(pool->pages[j]);
+ pool->avail = 0;
+ kfree(pool);
+ homa->page_pools[i] = NULL;
+ }
+
+ kfree(homa->skb_pages_to_free);
+ homa->skb_pages_to_free = NULL;
+ homa->pages_to_free_slots = 0;
+}
+
+/**
+ * homa_skb_alloc_tx() - Allocate a new sk_buff for outgoing data.
+ * @length: Number of bytes of data that the caller would like to
+ * have available in the linear part of the sk_buff for
+ * the Homa header and additional data beyond that. This
+ * function will allocate additional space for IP and
+ * Ethernet headers, as well as for the homa_skb_info.
+ * Return: New sk_buff, or NULL if there was insufficient memory.
+ * The sk_buff will be configured so that the next
+ * skb_put will be for the transport (Homa) header. The
+ * homa_skb_info is not initialized.
+ */
+struct sk_buff *homa_skb_alloc_tx(int length)
+{
+ u64 start = homa_clock();
+ struct sk_buff *skb;
+
+ skb = alloc_skb(HOMA_SKB_EXTRA + sizeof(struct homa_skb_info) + length,
+ GFP_ATOMIC);
+ if (likely(skb)) {
+ skb_reserve(skb, HOMA_SKB_EXTRA);
+ skb_reset_transport_header(skb);
+ }
+ INC_METRIC(skb_allocs, 1);
+ INC_METRIC(skb_alloc_cycles, homa_clock() - start);
+ return skb;
+}
+
+/**
+ * homa_skb_stash_pages() - Typically invoked at the beginning of
+ * preparing an output message; will collect from the page pool enough
+ * pages to meet the needs of the message and stash them locally for this
+ * core, so that the global lock for the page pool only needs to be acquired
+ * once.
+ * @homa: Overall data about the Homa protocol implementation.
+ * @length: Length of the message being prepared. Must be <=
+ * HOMA_MAX_MESSAGE_LENGTH.
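+ *
+ * Illustrative use when building an outgoing message (sketch only;
+ * msg_length and the other arguments are hypothetical):
+ *
+ *	homa_skb_stash_pages(homa, msg_length);
+ *	skb = homa_skb_alloc_tx(header_bytes);
+ *	err = homa_skb_append_from_iter(homa, skb, iter, chunk_bytes);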
+ */ +void homa_skb_stash_pages(struct homa *homa, int length) +{ + struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, + smp_processor_id()); + struct homa_page_pool *pool = skb_core->pool; + int pages_needed = HOMA_MAX_STASHED(length); + + if (pages_needed < 2 || skb_core->num_stashed_pages >= pages_needed) + return; + spin_lock_bh(&homa->page_pool_mutex); + while (pool->avail && (skb_core->num_stashed_pages < pages_needed)) { + pool->avail--; + if (pool->avail < pool->low_mark) + pool->low_mark = pool->avail; + skb_core->stashed_pages[skb_core->num_stashed_pages] = + pool->pages[pool->avail]; + skb_core->num_stashed_pages++; + } + spin_unlock_bh(&homa->page_pool_mutex); +} + +/** + * homa_skb_extend_frags() - Allocate additional space in the frags part + * of an skb (ideally by just expanding the last fragment). Returns + * one contiguous chunk, whose size is <= @length. + * @homa: Overall data about the Homa protocol implementation. + * @skb: Skbuff for which additional space is needed. + * @length: The preferred number of bytes to append; modified to hold + * the actual number allocated, which may be less. + * Return: Pointer to the new space, or NULL if space couldn't be + * allocated. + */ +void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + struct homa_skb_core *skb_core; + int actual_size = *length; + skb_frag_t *frag; + char *result; + + preempt_disable(); + + /* Can we just extend the skb's last fragment? */ + skb_core = &per_cpu(homa_skb_core, smp_processor_id()); + if (shinfo->nr_frags > 0) { + frag = &shinfo->frags[shinfo->nr_frags - 1]; + if (skb_frag_page(frag) == skb_core->skb_page && + skb_core->page_inuse < skb_core->page_size && + (frag->offset + skb_frag_size(frag)) == + skb_core->page_inuse) { + if ((skb_core->page_size - skb_core->page_inuse) < + actual_size) + actual_size = skb_core->page_size - + skb_core->page_inuse; + *length = actual_size; + skb_frag_size_add(frag, actual_size); + result = page_address(skb_frag_page(frag)) + + skb_core->page_inuse; + skb_core->page_inuse += actual_size; + skb_len_add(skb, actual_size); + goto done; + } + } + + /* Need to add a new fragment to the skb. */ + skb_core->page_inuse = ALIGN(skb_core->page_inuse, SMP_CACHE_BYTES); + if (skb_core->page_inuse >= skb_core->page_size) { + if (!homa_skb_page_alloc(homa, skb_core)) { + result = NULL; + goto done; + } + } + if ((skb_core->page_size - skb_core->page_inuse) < actual_size) + actual_size = skb_core->page_size - skb_core->page_inuse; + frag = &shinfo->frags[shinfo->nr_frags]; + shinfo->nr_frags++; + frag_page_set(frag, skb_core->skb_page); + get_page(skb_core->skb_page); + frag->offset = skb_core->page_inuse; + *length = actual_size; + skb_frag_size_set(frag, actual_size); + result = page_address(skb_frag_page(frag)) + skb_core->page_inuse; + skb_core->page_inuse += actual_size; + skb_len_add(skb, actual_size); + +done: + preempt_enable(); + return result; +} + +/** + * homa_skb_page_alloc() - Allocate a new page for skb allocation for a + * given core. Any existing page is released. + * @homa: Overall data about the Homa protocol implementation. + * @skb_core: Core-specific info; the page will be allocated in this core. + * Return: True if successful, false if memory not available. 
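+ *
+ * Pages are obtained from the first of these sources that succeeds:
+ * reusing the current page (if its reference count shows it is free),
+ * a page stashed for this core, the NUMA-local page pool, a freshly
+ * allocated high-order page, and finally a single normal page.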
+ */
+bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core)
+{
+ struct homa_page_pool *pool;
+ u64 start;
+
+ if (skb_core->skb_page) {
+ if (page_ref_count(skb_core->skb_page) == 1) {
+ /* The existing page is no longer in use, so we can
+ * reuse it.
+ */
+ skb_core->page_inuse = 0;
+ goto success;
+ }
+ put_page(skb_core->skb_page);
+ }
+
+ /* Step 1: does this core have a stashed page? */
+ skb_core->page_size = HOMA_SKB_PAGE_SIZE;
+ skb_core->page_inuse = 0;
+ if (skb_core->num_stashed_pages > 0) {
+ skb_core->num_stashed_pages--;
+ skb_core->skb_page = skb_core->stashed_pages[skb_core->num_stashed_pages];
+ goto success;
+ }
+
+ /* Step 2: can we retrieve a page from the pool for this NUMA node? */
+ pool = skb_core->pool;
+ if (pool->avail) {
+ UNIT_HOOK("skb_page_alloc_race");
+ spin_lock_bh(&homa->page_pool_mutex);
+
+ /* Must recheck: the count could have changed before we
+ * acquired the lock.
+ */
+ if (pool->avail) {
+ pool->avail--;
+ if (pool->avail < pool->low_mark)
+ pool->low_mark = pool->avail;
+ skb_core->skb_page = pool->pages[pool->avail];
+ spin_unlock_bh(&homa->page_pool_mutex);
+ goto success;
+ }
+ spin_unlock_bh(&homa->page_pool_mutex);
+ }
+
+ /* Step 3: can we allocate a new big page? */
+ INC_METRIC(skb_page_allocs, 1);
+ start = homa_clock();
+ skb_core->skb_page = alloc_pages(GFP_ATOMIC | __GFP_COMP
+ | __GFP_NOWARN | __GFP_NORETRY, HOMA_SKB_PAGE_ORDER);
+ if (likely(skb_core->skb_page)) {
+ INC_METRIC(skb_page_alloc_cycles, homa_clock() - start);
+ goto success;
+ }
+
+ /* Step 4: can we allocate a normal page? */
+ skb_core->skb_page = alloc_page(GFP_ATOMIC);
+ INC_METRIC(skb_page_alloc_cycles, homa_clock() - start);
+ if (likely(skb_core->skb_page)) {
+ skb_core->page_size = PAGE_SIZE;
+ goto success;
+ }
+ skb_core->page_size = 0;
+ skb_core->page_inuse = 0;
+ return false;
+
+success:
+ return true;
+}
+
+/**
+ * homa_skb_append_to_frag() - Append a block of data to an sk_buff
+ * by allocating new space at the end of the frags area and copying the
+ * data into that new space.
+ * @homa: Overall data about the Homa protocol implementation.
+ * @skb: Append to this sk_buff.
+ * @buf: Address of first byte of data to be appended.
+ * @length: Number of bytes to append.
+ * Return: 0 or a negative errno.
+ */
+int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, void *buf,
+ int length)
+{
+ int chunk_length;
+ char *src = buf;
+ char *dst;
+
+ while (length > 0) {
+ chunk_length = length;
+ dst = (char *)homa_skb_extend_frags(homa, skb, &chunk_length);
+ if (!dst)
+ return -ENOMEM;
+ memcpy(dst, src, chunk_length);
+ length -= chunk_length;
+ src += chunk_length;
+ }
+ return 0;
+}
+
+/**
+ * homa_skb_append_from_iter() - Append data to an sk_buff by allocating
+ * new space at the end of the frags area and copying data into that space.
+ * @homa: Overall data about the Homa protocol implementation.
+ * @skb: Append to this sk_buff.
+ * @iter: Describes location of data to append; modified to reflect
+ * the copied data.
+ * @length: Number of bytes to append; iter must have at least this many bytes.
+ * Return: 0 or a negative errno.
+ */ +int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, + struct iov_iter *iter, int length) +{ + int chunk_length; + char *dst; + + while (length > 0) { + chunk_length = length; + dst = (char *)homa_skb_extend_frags(homa, skb, &chunk_length); + if (!dst) + return -ENOMEM; + if (copy_from_iter(dst, chunk_length, iter) != chunk_length) + return -EFAULT; + length -= chunk_length; + } + return 0; +} + +/** + * homa_skb_append_from_skb() - Copy data from one skb to another. The + * data is appended into new frags at the destination. The copies are done + * virtually when possible. + * @homa: Overall data about the Homa protocol implementation. + * @dst_skb: Data gets added to the end of this skb. + * @src_skb: Data is copied out of this skb. + * @offset: Offset within @src_skb of first byte to copy, relative + * to the transport header. + * @length: Total number of bytes to copy; fewer bytes than this may + * be copied if @src_skb isn't long enough to hold all of the + * desired bytes. + * Return: 0 for success or a negative errno if an error occurred. + */ +int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, + struct sk_buff *src_skb, int offset, int length) +{ + int src_frag_offset, src_frags_left, chunk_size, err, head_len; + struct skb_shared_info *src_shinfo = skb_shinfo(src_skb); + struct skb_shared_info *dst_shinfo = skb_shinfo(dst_skb); + skb_frag_t *src_frag, *dst_frag; + + /* Copy bytes from the linear part of the source, if any. */ + head_len = skb_tail_pointer(src_skb) - skb_transport_header(src_skb); + if (offset < head_len) { + chunk_size = length; + if (chunk_size > (head_len - offset)) + chunk_size = head_len - offset; + err = homa_skb_append_to_frag(homa, dst_skb, + skb_transport_header(src_skb) + offset, + chunk_size); + if (err) + return err; + offset += chunk_size; + length -= chunk_size; + } + + /* Virtually copy bytes from source frags, if needed. */ + src_frag_offset = head_len; + for (src_frags_left = src_shinfo->nr_frags, src_frag = &src_shinfo->frags[0]; + (src_frags_left > 0) && (length > 0); + src_frags_left--, src_frag_offset += skb_frag_size(src_frag), + src_frag++) { + if (offset >= (src_frag_offset + skb_frag_size(src_frag))) + continue; + chunk_size = skb_frag_size(src_frag) - (offset - src_frag_offset); + if (chunk_size > length) + chunk_size = length; + if (dst_shinfo->nr_frags == HOMA_MAX_SKB_FRAGS) + return -EINVAL; + dst_frag = &dst_shinfo->frags[dst_shinfo->nr_frags]; + dst_shinfo->nr_frags++; + frag_page_set(dst_frag, skb_frag_page(src_frag)); + get_page(skb_frag_page(src_frag)); + dst_frag->offset = src_frag->offset + + (offset - src_frag_offset); + skb_frag_size_set(dst_frag, chunk_size); + offset += chunk_size; + length -= chunk_size; + skb_len_add(dst_skb, chunk_size); + } + return 0; +} + +/** + * homa_skb_free_tx() - Release the storage for an sk_buff. + * @homa: Overall data about the Homa protocol implementation. + * @skb: sk_buff to free; should have been allocated by + * homa_skb_alloc_tx. + */ +void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb) +{ + homa_skb_free_many_tx(homa, &skb, 1); +} + +/** + * homa_skb_free_many_tx() - Release the storage for multiple sk_buffs. + * @homa: Overall data about the Homa protocol implementation. + * @skbs: Pointer to first entry in array of sk_buffs to free. All of + * these should have been allocated by homa_skb_alloc_tx. + * @count: Total number of sk_buffs to free. 
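+ *
+ * High-order pages whose only remaining reference comes from these
+ * sk_buffs are recycled into the per-NUMA page pools (see
+ * homa_skb_cache_pages); all other pages are simply released.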
+ */
+void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count)
+{
+#ifdef __UNIT_TEST__
+#define MAX_PAGES_AT_ONCE 3
+#else
+#define MAX_PAGES_AT_ONCE 50
+#endif
+ struct page *pages_to_cache[MAX_PAGES_AT_ONCE];
+ u64 start = homa_clock();
+ int num_pages = 0;
+ int i, j;
+
+ for (i = 0; i < count; i++) {
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb = skbs[i];
+
+ shinfo = skb_shinfo(skb);
+ if (refcount_read(&skb->users) != 1) {
+ /* This sk_buff is still in use somewhere, so can't
+ * reclaim its pages.
+ */
+ consume_skb(skb);
+ continue;
+ }
+
+ /* Reclaim cacheable pages. */
+ for (j = 0; j < shinfo->nr_frags; j++) {
+ struct page *page = skb_frag_page(&shinfo->frags[j]);
+
+ if (compound_order(page) == HOMA_SKB_PAGE_ORDER &&
+ page_ref_count(page) == 1) {
+ pages_to_cache[num_pages] = page;
+ num_pages++;
+ if (num_pages == MAX_PAGES_AT_ONCE) {
+ homa_skb_cache_pages(homa, pages_to_cache,
+ num_pages);
+ num_pages = 0;
+ }
+ } else {
+ put_page(page);
+ }
+ }
+ shinfo->nr_frags = 0;
+ consume_skb(skb);
+ }
+ if (num_pages > 0)
+ homa_skb_cache_pages(homa, pages_to_cache, num_pages);
+ INC_METRIC(skb_frees, count);
+ INC_METRIC(skb_free_cycles, homa_clock() - start);
+}
+
+/**
+ * homa_skb_cache_pages() - Return pages to the global Homa cache of
+ * pages for sk_buffs.
+ * @homa: Overall data about the Homa protocol implementation.
+ * @pages: Array of pages to cache.
+ * @count: Number of pages in @pages.
+ */
+void homa_skb_cache_pages(struct homa *homa, struct page **pages, int count)
+{
+#ifdef __UNIT_TEST__
+#define LIMIT 4
+#else
+#define LIMIT HOMA_PAGE_POOL_SIZE
+#endif
+ int i;
+
+ spin_lock_bh(&homa->page_pool_mutex);
+ for (i = 0; i < count; i++) {
+ struct page *page = pages[i];
+ struct homa_page_pool *pool;
+
+ pool = homa->page_pools[page_to_nid(page)];
+ if (pool->avail < LIMIT) {
+ pool->pages[pool->avail] = page;
+ pool->avail++;
+ } else {
+ put_page(pages[i]);
+ }
+ }
+ spin_unlock_bh(&homa->page_pool_mutex);
+}
+
+/**
+ * homa_skb_get() - Copy out part of the contents of a packet.
+ * @skb: sk_buff from which to copy data.
+ * @dest: Where to copy the data.
+ * @offset: Offset within skb of first byte to copy, measured
+ * relative to the transport header.
+ * @length: Total number of bytes to copy; will copy fewer bytes than
+ * this if the packet doesn't contain @length bytes at @offset.
+ */
+void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length)
+{
+ int chunk_size, frags_left, frag_offset, head_len;
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ char *dst = dest;
+ skb_frag_t *frag;
+
+ /* Copy bytes from the linear part of the skb, if any.
 */
+ head_len = skb_tail_pointer(skb) - skb_transport_header(skb);
+ if (offset < head_len) {
+ chunk_size = length;
+ if (chunk_size > (head_len - offset))
+ chunk_size = head_len - offset;
+ memcpy(dst, skb_transport_header(skb) + offset, chunk_size);
+ offset += chunk_size;
+ length -= chunk_size;
+ dst += chunk_size;
+ }
+
+ frag_offset = head_len;
+ for (frags_left = shinfo->nr_frags, frag = &shinfo->frags[0];
+ (frags_left > 0) && (length > 0);
+ frags_left--,
+ frag_offset += skb_frag_size(frag), frag++) {
+ if (offset >= (frag_offset + skb_frag_size(frag)))
+ continue;
+ chunk_size = skb_frag_size(frag) - (offset - frag_offset);
+ if (chunk_size > length)
+ chunk_size = length;
+ memcpy(dst, page_address(skb_frag_page(frag)) + frag->offset
+ + (offset - frag_offset),
+ chunk_size);
+ offset += chunk_size;
+ length -= chunk_size;
+ dst += chunk_size;
+ }
+}
+
+/**
+ * homa_skb_release_pages() - This function is invoked occasionally; its
+ * job is to gradually release pages from the sk_buff page pools back to
+ * Linux, based on sysctl parameters such as skb_page_frees_per_sec.
+ * @homa: Overall information about the Homa transport.
+ */
+void homa_skb_release_pages(struct homa *homa)
+{
+ int i, max_low_mark, min_pages, release, release_max;
+ struct homa_page_pool *max_pool = NULL;
+ u64 now = homa_clock();
+
+ if (now < homa->skb_page_free_time)
+ return;
+
+ /* Free pages every 0.5 second. */
+ homa->skb_page_free_time = now + (500 * homa_clock_khz());
+ release_max = homa->skb_page_frees_per_sec / 2;
+ if (homa->pages_to_free_slots < release_max) {
+ struct page **old = homa->skb_pages_to_free;
+
+ homa->skb_pages_to_free = kmalloc_array(release_max,
+ sizeof(struct page *),
+ GFP_ATOMIC);
+ if (homa->skb_pages_to_free) {
+ kfree(old);
+ homa->pages_to_free_slots = release_max;
+ } else {
+ homa->skb_pages_to_free = old;
+ release_max = homa->pages_to_free_slots;
+ }
+ }
+
+ /* Find the pool with the largest number of pages that haven't
+ * been used recently.
+ */
+ max_low_mark = -1;
+ spin_lock_bh(&homa->page_pool_mutex);
+ for (i = 0; i <= homa->max_numa; i++) {
+ struct homa_page_pool *pool = homa->page_pools[i];
+
+ if (!pool)
+ continue;
+ if (pool->low_mark > max_low_mark) {
+ max_low_mark = pool->low_mark;
+ max_pool = pool;
+ }
+ tt_record3("NUMA node %d has %d pages in skb page pool, low mark %d",
+ i, pool->avail, pool->low_mark);
+ pool->low_mark = pool->avail;
+ }
+ if (!max_pool) {
+ /* No page pools exist yet, so there is nothing to release. */
+ spin_unlock_bh(&homa->page_pool_mutex);
+ return;
+ }
+
+ /* Collect pages to free (but don't free them until after
+ * releasing the lock, since freeing is expensive).
+ */
+ min_pages = ((homa->skb_page_pool_min_kb * 1000) +
+ (HOMA_SKB_PAGE_SIZE - 1)) / HOMA_SKB_PAGE_SIZE;
+ release = max_low_mark - min_pages;
+ if (release > release_max)
+ release = release_max;
+ for (i = 0; i < release; i++) {
+ max_pool->avail--;
+ homa->skb_pages_to_free[i] = max_pool->pages[max_pool->avail];
+ }
+ max_pool->low_mark = max_pool->avail;
+ spin_unlock_bh(&homa->page_pool_mutex);
+
+ /* Free the pages that were collected. */
+ for (i = 0; i < release; i++) {
+ struct page *page = homa->skb_pages_to_free[i];
+
+ tt_record2("homa_skb_release_pages releasing page 0x%08x%08x",
+ tt_hi(page), tt_lo(page));
+ put_page(page);
+ }
+}
diff --git a/homa_skb.h b/homa_skb.h
new file mode 100644
index 00000000..f484fa23
--- /dev/null
+++ b/homa_skb.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file contains definitions related to efficient management of
+ * memory associated with transmit sk_buffs.
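+ *
+ * The design is two-level: each core allocates frag space from a
+ * private high-order page (struct homa_skb_core), and free pages are
+ * cached in per-NUMA-node pools (struct homa_page_pool) so that cores
+ * rarely need to invoke the kernel page allocator.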
+ */
+
+#ifndef _HOMA_SKB_H
+#define _HOMA_SKB_H
+
+#include <linux/skbuff.h>
+
+/**
+ * define HOMA_SKB_PAGE_ORDER - power-of-two exponent determining how
+ * many pages to allocate in each high-order page used for skb frags
+ * (e.g., 2 means allocate in units of 4 pages).
+ */
+#define HOMA_SKB_PAGE_ORDER 4
+
+/**
+ * define HOMA_SKB_PAGE_SIZE - number of bytes corresponding to
+ * HOMA_SKB_PAGE_ORDER.
+ */
+#define HOMA_SKB_PAGE_SIZE (PAGE_SIZE << HOMA_SKB_PAGE_ORDER)
+
+/**
+ * struct homa_page_pool - A cache of free pages available for use in tx skbs.
+ * Each page is of size HOMA_SKB_PAGE_SIZE, and a pool is dedicated for
+ * use by a single NUMA node. Access to these objects is synchronized with
+ * @homa->page_pool_mutex.
+ */
+struct homa_page_pool {
+ /** @avail: Number of free pages currently in the pool. */
+ int avail;
+
+ /**
+ * @low_mark: Low water mark: smallest value of avail since the
+ * last time homa_skb_release_pages reset it.
+ */
+ int low_mark;
+
+#define HOMA_PAGE_POOL_SIZE 1000
+
+ /**
+ * @pages: Pointers to pages that are currently free; the ref count
+ * is 1 in each of these pages.
+ */
+ struct page *pages[HOMA_PAGE_POOL_SIZE];
+};
+
+/**
+ * struct homa_skb_core - Stores core-specific information related to
+ * sk_buff allocation. All values are assumed to be zero initially.
+ */
+struct homa_skb_core {
+ /**
+ * @pool: NUMA-specific page pool from which to allocate skb pages
+ * for this core.
+ */
+ struct homa_page_pool *pool;
+
+ /**
+ * @skb_page: a page of memory currently being used for skb frags.
+ * This pointer is included in the page's reference count.
+ */
+ struct page *skb_page;
+
+ /**
+ * @page_inuse: offset of first byte in @skb_page that hasn't already
+ * been allocated.
+ */
+ int page_inuse;
+
+ /** @page_size: total number of bytes available in @skb_page. */
+ int page_size;
+
+ /* Maximum number of stashed pages that can be consumed by a message
+ * of a given size (assumes page_inuse is 0). This is a rough guess,
+ * since it doesn't consider all of the data_segments that will be
+ * needed for the packets.
+ */
+#define HOMA_MAX_STASHED(size) ((((size) - 1) / HOMA_SKB_PAGE_SIZE) + 1)
+
+ /**
+ * @num_stashed_pages: number of pages currently available in
+ * stashed_pages.
+ */
+ int num_stashed_pages;
+
+ /**
+ * @stashed_pages: used to prefetch from the page pool all of the pages
+ * a message will need with a single operation, to avoid having to
+ * synchronize separately for each page. Note: these pages are all
+ * HOMA_SKB_PAGE_SIZE in length.
+DECLARE_PER_CPU(struct homa_skb_core, homa_skb_core);
+
+struct sk_buff *homa_skb_alloc_tx(int length);
+int homa_skb_append_from_iter(struct homa *homa,
+			      struct sk_buff *skb, struct iov_iter *iter,
+			      int length);
+int homa_skb_append_from_skb(struct homa *homa,
+			     struct sk_buff *dst_skb,
+			     struct sk_buff *src_skb, int offset,
+			     int length);
+int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb,
+			    void *buf, int length);
+void homa_skb_cache_pages(struct homa *homa, struct page **pages,
+			  int count);
+void homa_skb_cleanup(struct homa *homa);
+void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb,
+			    int *length);
+void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb);
+void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs,
+			   int count);
+void homa_skb_get(struct sk_buff *skb, void *dest, int offset,
+		  int length);
+int homa_skb_init(struct homa *homa);
+bool homa_skb_page_alloc(struct homa *homa,
+			 struct homa_skb_core *core);
+void homa_skb_release_pages(struct homa *homa);
+void homa_skb_stash_pages(struct homa *homa, int length);
+
+#endif /* _HOMA_SKB_H */
diff --git a/homa_sock.c b/homa_sock.c
new file mode 100644
index 00000000..0c5e97f6
--- /dev/null
+++ b/homa_sock.c
@@ -0,0 +1,533 @@
+// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+/* This file manages homa_sock and homa_socktab objects. */
+
+#include "homa_impl.h"
+#include "homa_interest.h"
+#include "homa_peer.h"
+#include "homa_pool.h"
+
+#ifndef __STRIP__ /* See strip.py */
+#include "homa_grant.h"
+#endif /* See strip.py */
+
+/**
+ * homa_socktab_init() - Constructor for homa_socktabs.
+ * @socktab:  The object to initialize; previous contents are discarded.
+ */
+void homa_socktab_init(struct homa_socktab *socktab)
+{
+	int i;
+
+	spin_lock_init(&socktab->write_lock);
+	for (i = 0; i < HOMA_SOCKTAB_BUCKETS; i++)
+		INIT_HLIST_HEAD(&socktab->buckets[i]);
+}
+
+/**
+ * homa_socktab_destroy() - Destructor for homa_socktabs: deletes all
+ * existing sockets.
+ * @socktab:  The object to destroy.
+ * @hnet:     If non-NULL, only sockets for this namespace are deleted.
+ */
+void homa_socktab_destroy(struct homa_socktab *socktab, struct homa_net *hnet)
+{
+	struct homa_socktab_scan scan;
+	struct homa_sock *hsk;
+
+	for (hsk = homa_socktab_start_scan(socktab, &scan); hsk;
+	     hsk = homa_socktab_next(&scan)) {
+		if (hnet && hnet != hsk->hnet)
+			continue;
+
+		/* In actual use there should be no sockets left when this
+		 * function is invoked, so the code below will never run.
+		 * However, it is useful during unit tests.
+		 */
+		homa_sock_shutdown(hsk);
+		homa_sock_destroy(&hsk->sock);
+	}
+	homa_socktab_end_scan(&scan);
+}
+
+/**
+ * homa_socktab_start_scan() - Begin an iteration over all of the sockets
+ * in a socktab.
+ * @socktab:  Socktab to scan.
+ * @scan:     Will hold the current state of the scan; any existing
+ *            contents are discarded. The caller must eventually pass this
+ *            to homa_socktab_end_scan.
+ *
+ * Return:    The first socket in the table, or NULL if the table is
+ *            empty. If non-NULL, a reference is held on the socket to
+ *            prevent its deletion.
+ *
+ * Each call to homa_socktab_next will return the next socket in the table.
+ * All sockets that are present in the table at the time this function is
+ * invoked will eventually be returned, as long as they are not removed
+ * from the table. It is safe to remove sockets from the table while the
+ * scan is in progress.
If a socket is removed from the table during the scan, + * it may or may not be returned by homa_socktab_next. New entries added + * during the scan may or may not be returned. + */ +struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, + struct homa_socktab_scan *scan) +{ + scan->socktab = socktab; + scan->hsk = NULL; + scan->current_bucket = -1; + + return homa_socktab_next(scan); +} + +/** + * homa_socktab_next() - Return the next socket in an iteration over a socktab. + * @scan: State of the scan. + * + * Return: The next socket in the table, or NULL if the iteration has + * returned all of the sockets in the table. If non-NULL, a + * reference is held on the socket to prevent its deletion. + * Sockets are not returned in any particular order. It's + * possible that the returned socket has been destroyed. + */ +struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) +{ + struct hlist_head *bucket; + struct hlist_node *next; + + rcu_read_lock(); + if (scan->hsk) { + sock_put(&scan->hsk->sock); + next = rcu_dereference(hlist_next_rcu(&scan->hsk->socktab_links)); + if (next) + goto success; + } + for (scan->current_bucket++; + scan->current_bucket < HOMA_SOCKTAB_BUCKETS; + scan->current_bucket++) { + bucket = &scan->socktab->buckets[scan->current_bucket]; + next = rcu_dereference(hlist_first_rcu(bucket)); + if (next) + goto success; + } + scan->hsk = NULL; + rcu_read_unlock(); + return NULL; + +success: + scan->hsk = hlist_entry(next, struct homa_sock, socktab_links); + sock_hold(&scan->hsk->sock); + rcu_read_unlock(); + return scan->hsk; +} + +/** + * homa_socktab_end_scan() - Must be invoked on completion of each scan + * to clean up state associated with the scan. + * @scan: State of the scan. + */ +void homa_socktab_end_scan(struct homa_socktab_scan *scan) +{ + if (scan->hsk) { + sock_put(&scan->hsk->sock); + scan->hsk = NULL; + } +} + +/** + * homa_sock_init() - Constructor for homa_sock objects. This function + * initializes only the parts of the socket that are owned by Homa. + * @hsk: Object to initialize. The Homa-specific parts must have been + * initialized to zeroes by the caller. + * + * Return: 0 for success, otherwise a negative errno. + */ +int homa_sock_init(struct homa_sock *hsk) +{ + struct homa_pool *buffer_pool; + struct homa_socktab *socktab; + struct homa_sock *other; + struct homa_net *hnet; + struct homa *homa; + int starting_port; + int result = 0; + int i; + + hnet = (struct homa_net *)net_generic(sock_net(&hsk->sock), + homa_net_id); + homa = hnet->homa; + socktab = homa->socktab; + + /* Initialize fields outside the Homa part. */ + hsk->sock.sk_sndbuf = homa->wmem_max; + sock_set_flag(&hsk->inet.sk, SOCK_RCU_FREE); +#ifndef __STRIP__ /* See strip.py */ + if (homa->hijack_tcp) + hsk->sock.sk_protocol = IPPROTO_TCP; +#endif /* See strip.py */ + + /* Do things requiring memory allocation before locking the socket, + * so that GFP_ATOMIC is not needed. + */ + buffer_pool = homa_pool_alloc(hsk); + if (IS_ERR(buffer_pool)) + return PTR_ERR(buffer_pool); + + /* Initialize Homa-specific fields. We can initialize everything + * except the port and hash table links without acquiring the + * socket lock. + */ + hsk->homa = homa; + hsk->hnet = hnet; + hsk->buffer_pool = buffer_pool; + hsk->inet.inet_num = hsk->port; + hsk->inet.inet_sport = htons(hsk->port); + + hsk->is_server = false; + hsk->shutdown = false; + hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) ? 
+ sizeof(struct iphdr) : sizeof(struct ipv6hdr); + spin_lock_init(&hsk->lock); + atomic_set(&hsk->protect_count, 0); + INIT_LIST_HEAD(&hsk->active_rpcs); + INIT_LIST_HEAD(&hsk->dead_rpcs); + hsk->dead_skbs = 0; + INIT_LIST_HEAD(&hsk->waiting_for_bufs); + INIT_LIST_HEAD(&hsk->ready_rpcs); + INIT_LIST_HEAD(&hsk->interests); + for (i = 0; i < HOMA_CLIENT_RPC_BUCKETS; i++) { + struct homa_rpc_bucket *bucket = &hsk->client_rpc_buckets[i]; + + spin_lock_init(&bucket->lock); + bucket->id = i; + INIT_HLIST_HEAD(&bucket->rpcs); + } + for (i = 0; i < HOMA_SERVER_RPC_BUCKETS; i++) { + struct homa_rpc_bucket *bucket = &hsk->server_rpc_buckets[i]; + + spin_lock_init(&bucket->lock); + bucket->id = i + 1000000; + INIT_HLIST_HEAD(&bucket->rpcs); + } + + /* Pick a default port. Must keep the socktab locked from now + * until the new socket is added to the socktab, to ensure that + * no other socket chooses the same port. + */ + spin_lock_bh(&socktab->write_lock); + starting_port = hnet->prev_default_port; + while (1) { + hnet->prev_default_port++; + if (hnet->prev_default_port < HOMA_MIN_DEFAULT_PORT) + hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT; + other = homa_sock_find(hnet, hnet->prev_default_port); + if (!other) + break; + sock_put(&other->sock); + if (hnet->prev_default_port == starting_port) { + spin_unlock_bh(&socktab->write_lock); + hsk->shutdown = true; + hsk->homa = NULL; + result = -EADDRNOTAVAIL; + goto error; + } + spin_unlock_bh(&socktab->write_lock); + cond_resched(); + spin_lock_bh(&socktab->write_lock); + } + hsk->port = hnet->prev_default_port; + hlist_add_head_rcu(&hsk->socktab_links, + &socktab->buckets[homa_socktab_bucket(hnet, + hsk->port)]); + spin_unlock_bh(&socktab->write_lock); + return result; + +error: + homa_pool_free(buffer_pool); + return result; +} + +/* + * homa_sock_unlink() - Unlinks a socket from its socktab and does + * related cleanups. Once this method returns, the socket will not be + * discoverable through the socktab. + * @hsk: Socket to unlink. + */ +void homa_sock_unlink(struct homa_sock *hsk) +{ + struct homa_socktab *socktab = hsk->homa->socktab; + + spin_lock_bh(&socktab->write_lock); + hlist_del_rcu(&hsk->socktab_links); + spin_unlock_bh(&socktab->write_lock); +} + +/** + * homa_sock_shutdown() - Disable a socket so that it can no longer + * be used for either sending or receiving messages. Any system calls + * currently waiting to send or receive messages will be aborted. This + * function will terminate any existing use of the socket, but it does + * not free up socket resources: that happens in homa_sock_destroy. + * @hsk: Socket to shut down. + */ +void homa_sock_shutdown(struct homa_sock *hsk) +{ + struct homa_interest *interest; + struct homa_rpc *rpc; + + tt_record1("Starting shutdown for socket %d", hsk->port); + homa_sock_lock(hsk); + if (hsk->shutdown || !hsk->homa) { + homa_sock_unlock(hsk); + return; + } + + /* The order of cleanup is very important, because there could be + * active operations that hold RPC locks but not the socket lock. + * 1. Set @shutdown; this ensures that no new RPCs will be created for + * this socket (though some creations might already be in progress). + * 2. Remove the socket from its socktab: this ensures that + * incoming packets for the socket will be dropped. + * 3. Go through all of the RPCs and delete them; this will + * synchronize with any operations in progress. + * 4. Perform other socket cleanup: at this point we know that + * there will be no concurrent activities on individual RPCs. + * 5. 
Don't delete the buffer pool until after all of the RPCs
+	 *    have been reaped.
+	 * See "Homa Locking Strategy" in homa_impl.h for additional information
+	 * about locking.
+	 */
+	hsk->shutdown = true;
+	homa_sock_unlink(hsk);
+	homa_sock_unlock(hsk);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
+		homa_rpc_lock(rpc);
+		homa_rpc_end(rpc);
+		homa_rpc_unlock(rpc);
+	}
+	rcu_read_unlock();
+
+	homa_sock_lock(hsk);
+	while (!list_empty(&hsk->interests)) {
+		interest = list_first_entry(&hsk->interests,
+					    struct homa_interest, links);
+		list_del_init(&interest->links);
+		atomic_set_release(&interest->ready, 1);
+		wake_up(&interest->wait_queue);
+	}
+	homa_sock_unlock(hsk);
+	tt_record1("Finished shutdown for socket %d", hsk->port);
+}
+
+/**
+ * homa_sock_destroy() - Release all of the internal resources associated
+ * with a socket; invoked at a time when that is safe (i.e., all references
+ * on the socket have been dropped).
+ * @sk:    Socket to destroy.
+ */
+void homa_sock_destroy(struct sock *sk)
+{
+	struct homa_sock *hsk = homa_sk(sk);
+
+	IF_NO_STRIP(int i = 0);
+
+	if (!hsk->homa)
+		return;
+
+	tt_record1("Starting to destroy socket %d", hsk->port);
+	while (!list_empty(&hsk->dead_rpcs)) {
+		homa_rpc_reap(hsk, true);
+#ifndef __STRIP__ /* See strip.py */
+		i++;
+		if (i == 5) {
+			tt_record("Freezing because reap seems hung");
+			tt_freeze();
+		}
+#endif /* See strip.py */
+	}
+
+	WARN_ON_ONCE(refcount_read(&hsk->sock.sk_wmem_alloc) != 1);
+#ifdef __UNIT_TEST__
+	{
+		u64 tx_memory = refcount_read(&hsk->sock.sk_wmem_alloc);
+
+		if (tx_memory != 1)
+			FAIL(" sk_wmem_alloc %llu after shutdown for port %d",
+			     tx_memory, hsk->port);
+	}
+#endif /* __UNIT_TEST__ */
+
+	if (hsk->buffer_pool) {
+		homa_pool_free(hsk->buffer_pool);
+		hsk->buffer_pool = NULL;
+	}
+	tt_record1("Finished destroying socket %d", hsk->port);
+}
+
+/**
+ * homa_sock_bind() - Associates a server port with a socket; if there
+ * was a previous server port assignment for @hsk, it is abandoned.
+ * @hnet:     Network namespace with which port is associated.
+ * @hsk:      Homa socket.
+ * @port:     Desired server port for @hsk. If 0, then this call
+ *            becomes a no-op: the socket will continue to use
+ *            its randomly assigned client port.
+ *
+ * Return:  0 for success, otherwise a negative errno. If an error is
+ *          returned, hsk->error_msg is set.
+ */
+int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk,
+		   u16 port)
+{
+	struct homa_socktab *socktab = hnet->homa->socktab;
+	struct homa_sock *owner;
+	int result = 0;
+
+	if (port == 0)
+		return result;
+	if (port >= HOMA_MIN_DEFAULT_PORT) {
+		hsk->error_msg = "port number invalid: in the automatically assigned range";
+		return -EINVAL;
+	}
+	homa_sock_lock(hsk);
+	spin_lock_bh(&socktab->write_lock);
+	if (hsk->shutdown) {
+		hsk->error_msg = "socket has been shut down";
+		result = -ESHUTDOWN;
+		goto done;
+	}
+
+	owner = homa_sock_find(hnet, port);
+	if (owner) {
+		sock_put(&owner->sock);
+		if (owner != hsk) {
+			hsk->error_msg = "requested port number is already in use";
+			result = -EADDRINUSE;
+		}
+		goto done;
+	}
+	hlist_del_rcu(&hsk->socktab_links);
+	hsk->port = port;
+	hsk->inet.inet_num = port;
+	hsk->inet.inet_sport = htons(hsk->port);
+	hlist_add_head_rcu(&hsk->socktab_links,
+			   &socktab->buckets[homa_socktab_bucket(hnet, port)]);
+	hsk->is_server = true;
+done:
+	spin_unlock_bh(&socktab->write_lock);
+	homa_sock_unlock(hsk);
+	return result;
+}
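
Before the definition of homa_sock_find below, it may help to see its reference-counting contract from the caller's side. This is a hypothetical caller (example_use_port is not in the sources), sketching the pattern the kernel-doc requires:

```c
/* Hypothetical caller of homa_sock_find: on success the lookup returns
 * with a reference held on the socket, which must be dropped with
 * sock_put once the socket is no longer needed.
 */
static int example_use_port(struct homa_net *hnet, u16 port)
{
	struct homa_sock *hsk;

	hsk = homa_sock_find(hnet, port);
	if (!hsk)
		return -ENOENT;		/* No socket bound to @port. */

	/* ... hsk is safe to use here; the reference prevents freeing ... */

	sock_put(&hsk->sock);		/* Drop the lookup's reference. */
	return 0;
}
```
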
+
+/**
+ * homa_sock_find() - Returns the socket associated with a given port.
+ * @hnet:    Network namespace where the socket will be used.
+ * @port:    The port of interest.
+ * Return:   The socket that owns @port, or NULL if none. If non-NULL
+ *           then this method has taken a reference on the socket and
+ *           the caller must call sock_put to release it.
+ */
+struct homa_sock *homa_sock_find(struct homa_net *hnet, u16 port)
+{
+	int bucket = homa_socktab_bucket(hnet, port);
+	struct homa_sock *result = NULL;
+	struct homa_sock *hsk;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(hsk, &hnet->homa->socktab->buckets[bucket],
+				 socktab_links) {
+		if (hsk->port == port && hsk->hnet == hnet) {
+			result = hsk;
+			sock_hold(&hsk->sock);
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return result;
+}
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_sock_lock_slow() - This function implements the slow path for
+ * acquiring a socket lock. It is invoked when a socket lock isn't immediately
+ * available. It waits for the lock, but also records statistics about
+ * the waiting time.
+ * @hsk:    socket to lock.
+ */
+void homa_sock_lock_slow(struct homa_sock *hsk)
+	__acquires(hsk->lock)
+{
+	u64 start = homa_clock();
+
+	tt_record("beginning wait for socket lock");
+	spin_lock_bh(&hsk->lock);
+	tt_record("ending wait for socket lock");
+	INC_METRIC(socket_lock_misses, 1);
+	INC_METRIC(socket_lock_miss_cycles, homa_clock() - start);
+}
+
+/**
+ * homa_bucket_lock_slow() - This function implements the slow path for
+ * locking a bucket in one of the hash tables of RPCs. It is invoked when a
+ * lock isn't immediately available. It waits for the lock, but also records
+ * statistics about the waiting time.
+ * @bucket:    The hash table bucket to lock.
+ * @id:        Id of the RPC on whose behalf the bucket is being locked.
+ *             Used only for metrics.
+ */
+void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id)
+	__acquires(bucket->lock)
+{
+	u64 start = homa_clock();
+
+	tt_record2("beginning wait for rpc lock, id %d, (bucket %d)",
+		   id, bucket->id);
+	spin_lock_bh(&bucket->lock);
+	tt_record2("ending wait for bucket lock, id %d, (bucket %d)",
+		   id, bucket->id);
+	if (homa_is_client(id)) {
+		INC_METRIC(client_lock_misses, 1);
+		INC_METRIC(client_lock_miss_cycles, homa_clock() - start);
+	} else {
+		INC_METRIC(server_lock_misses, 1);
+		INC_METRIC(server_lock_miss_cycles, homa_clock() - start);
+	}
+}
+#endif /* See strip.py */
+
+/**
+ * homa_sock_wait_wmem() - Block the thread until @hsk's usage of tx
+ * packet memory drops below the socket's limit.
+ * @hsk:           Socket of interest.
+ * @nonblocking:   If there's not enough memory, return -EWOULDBLOCK instead
+ *                 of blocking.
+ * Return: 0 for success, otherwise a negative errno.
+ */
+int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking)
+{
+	long timeo = hsk->sock.sk_sndtimeo;
+	int result;
+
+	/* Note: we can't use sock_wait_for_wmem because that function
+	 * is not available to modules (as of August 2025 it's static).
+ */ + + if (nonblocking) + timeo = 0; + set_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags); + tt_record2("homa_sock_wait_wmem waiting on port %d, wmem %d", + hsk->port, refcount_read(&hsk->sock.sk_wmem_alloc)); + result = wait_event_interruptible_timeout(*sk_sleep(&hsk->sock), + homa_sock_wmem_avl(hsk) || hsk->shutdown, + timeo); + tt_record4("homa_sock_wait_wmem woke up on port %d with result %d, wmem %d, signal pending %d", + hsk->port, result, refcount_read(&hsk->sock.sk_wmem_alloc), + signal_pending(current)); + if (signal_pending(current)) + return -EINTR; + if (result == 0) + return -EWOULDBLOCK; + return 0; +} diff --git a/homa_sock.h b/homa_sock.h new file mode 100644 index 00000000..9b433bb2 --- /dev/null +++ b/homa_sock.h @@ -0,0 +1,464 @@ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ + +/* This file defines structs and other things related to Homa sockets. */ + +#ifndef _HOMA_SOCK_H +#define _HOMA_SOCK_H + +/* Forward declarations. */ +struct homa; +struct homa_pool; + +/* Number of hash buckets in a homa_socktab. Must be a power of 2. */ +#define HOMA_SOCKTAB_BUCKET_BITS 10 +#define HOMA_SOCKTAB_BUCKETS BIT(HOMA_SOCKTAB_BUCKET_BITS) + +/** + * struct homa_socktab - A hash table that maps from port numbers (either + * client or server) to homa_sock objects. + * + * This table is managed exclusively by homa_socktab.c, using RCU to + * minimize synchronization during lookups. + */ +struct homa_socktab { + /** + * @write_lock: Controls all modifications to this object; not needed + * for socket lookups (RCU is used instead). Also used to + * synchronize port allocation. + */ + spinlock_t write_lock; + + /** + * @buckets: Heads of chains for hash table buckets. Chains + * consist of homa_sock objects. + */ + struct hlist_head buckets[HOMA_SOCKTAB_BUCKETS]; +}; + +/** + * struct homa_socktab_scan - Records the state of an iteration over all + * the entries in a homa_socktab, in a way that is safe against concurrent + * reclamation of sockets. + */ +struct homa_socktab_scan { + /** @socktab: The table that is being scanned. */ + struct homa_socktab *socktab; + + /** + * @hsk: Points to the current socket in the iteration, or NULL if + * we're at the beginning or end of the iteration. If non-NULL then + * we are holding a reference to this socket. + */ + struct homa_sock *hsk; + + /** + * @current_bucket: The index of the bucket in socktab->buckets + * currently being scanned (-1 if @hsk == NULL). + */ + int current_bucket; +}; + +/** + * struct homa_rpc_bucket - One bucket in a hash table of RPCs. + */ + +struct homa_rpc_bucket { + /** + * @lock: serves as a lock both for this bucket (e.g., when + * adding and removing RPCs) and also for all of the RPCs in + * the bucket. Must be held whenever looking up an RPC in + * this bucket or manipulating an RPC in the bucket. This approach + * has the following properties: + * 1. An RPC can be looked up and locked (a common operation) with + * a single lock acquisition. + * 2. Looking up and locking are atomic: there is no window of + * vulnerability where someone else could delete an RPC after + * it has been looked up and before it has been locked. + * 3. The lookup mechanism does not use RCU. This is important because + * RPCs are created rapidly and typically live only a few tens of + * microseconds. As of May 2025 RCU introduces a lag of about + * 25 ms before objects can be deleted; for RPCs this would result + * in hundreds or thousands of RPCs accumulating before RCU allows + * them to be deleted. 
* This approach has the disadvantage that RPCs within a bucket share
+	 * locks and thus may not be able to work concurrently, but there are
+	 * enough buckets in the table to make such collisions rare.
+	 *
+	 * See "Homa Locking Strategy" in homa_impl.h for more info about
+	 * locking.
+	 */
+	spinlock_t lock;
+
+	/**
+	 * @id: identifier for this bucket, used in error messages etc.
+	 * It's the index of the bucket within its hash table bucket
+	 * array, with an additional offset to separate server and
+	 * client RPCs.
+	 */
+	int id;
+
+	/** @rpcs: list of RPCs that hash to this bucket. */
+	struct hlist_head rpcs;
+};
+
+/**
+ * define HOMA_CLIENT_RPC_BUCKETS - Number of buckets in hash tables for
+ * client RPCs. Must be a power of 2.
+ */
+#define HOMA_CLIENT_RPC_BUCKETS 1024
+
+/**
+ * define HOMA_SERVER_RPC_BUCKETS - Number of buckets in hash tables for
+ * server RPCs. Must be a power of 2.
+ */
+#define HOMA_SERVER_RPC_BUCKETS 1024
+
+/**
+ * struct homa_sock - Information about an open socket.
+ */
+struct homa_sock {
+	/* Info for other network layers. Note: IPv6 info (struct ipv6_pinfo)
+	 * comes at the very end of the struct, *after* Homa's data, if this
+	 * socket uses IPv6.
+	 */
+	union {
+		/** @sock: generic socket data; must be the first field. */
+		struct sock sock;
+
+		/**
+		 * @inet: generic Internet socket data; must also be the
+		 * first field (contains sock as its first member).
+		 */
+		struct inet_sock inet;
+	};
+
+	/**
+	 * @homa: Overall state about the Homa implementation. NULL
+	 * means this socket was never initialized or has been deleted.
+	 */
+	struct homa *homa;
+
+	/**
+	 * @hnet: Overall state specific to the network namespace for
+	 * this socket.
+	 */
+	struct homa_net *hnet;
+
+	/**
+	 * @buffer_pool: used to allocate buffer space for incoming messages.
+	 * Storage is dynamically allocated.
+	 */
+	struct homa_pool *buffer_pool;
+
+	/**
+	 * @port: Port number: identifies this socket uniquely among all
+	 * those on this node.
+	 */
+	u16 port;
+
+	/**
+	 * @is_server: True means that this socket can act as both client
+	 * and server; false means the socket is client-only.
+	 */
+	bool is_server;
+
+	/**
+	 * @shutdown: True means the socket is no longer usable (either
+	 * shutdown has already been invoked, or the socket was never
+	 * properly initialized). Note: can't use the SOCK_DEAD flag for
+	 * this because that flag doesn't get set until much later in the
+	 * process of closing a socket.
+	 */
+	bool shutdown;
+
+	/**
+	 * @ip_header_length: Length of IP headers for this socket (depends
+	 * on IPv4 vs. IPv6).
+	 */
+	int ip_header_length;
+
+	/** @socktab_links: Links this socket into a homa_socktab bucket. */
+	struct hlist_node socktab_links;
+
+	/**
+	 * @error_msg: Static string giving human-readable information about
+	 * the reason for the last error returned by a Homa kernel call.
+	 * Applications can fetch this with the HOMAIOCINFO ioctl to figure
+	 * out why a call failed.
+	 */
+	char *error_msg;
+
+	/* Information above is (almost) never modified; start a new
+	 * cache line below for info that is modified frequently.
+	 */
+
+	/**
+	 * @lock: Must be held when modifying fields such as interests
+	 * and lists of RPCs. This lock is used in place of sk->sk_lock
+	 * because it's used differently (it's always used as a simple
+	 * spin lock). See "Homa Locking Strategy" in homa_impl.h
+	 * for more on Homa's synchronization strategy.
+	 */
+	spinlock_t lock ____cacheline_aligned_in_smp;
+
+	/**
+	 * @protect_count: counts the number of calls to homa_protect_rpcs
+	 * for which there have not yet been calls to homa_unprotect_rpcs.
+	 */
+	atomic_t protect_count;
+
+	/**
+	 * @active_rpcs: List of all existing RPCs related to this socket,
+	 * including both client and server RPCs. This list isn't strictly
+	 * needed, since RPCs are already in one of the hash tables below,
+	 * but it's more efficient for homa_timer to have this list
+	 * (so it doesn't have to scan large numbers of hash buckets).
+	 * The list is sorted, with the oldest RPC first. Manipulate with
+	 * RCU so timer can access without locking.
+	 */
+	struct list_head active_rpcs;
+
+	/**
+	 * @dead_rpcs: Contains RPCs for which homa_rpc_end has been
+	 * called, but which have not yet been reaped by homa_rpc_reap.
+	 */
+	struct list_head dead_rpcs;
+
+	/** @dead_skbs: Total number of socket buffers in RPCs on dead_rpcs. */
+	int dead_skbs;
+
+	/**
+	 * @waiting_for_bufs: Contains RPCs that are blocked because there
+	 * wasn't enough space in the buffer pool region for their incoming
+	 * messages. Sorted in increasing order of message length.
+	 */
+	struct list_head waiting_for_bufs;
+
+	/**
+	 * @ready_rpcs: List of all RPCs that are ready for attention from
+	 * an application thread.
+	 */
+	struct list_head ready_rpcs;
+
+	/**
+	 * @interests: List of threads that are currently waiting for
+	 * incoming messages via homa_wait_shared.
+	 */
+	struct list_head interests;
+
+	/**
+	 * @client_rpc_buckets: Hash table for fast lookup of client RPCs.
+	 * Modifications are synchronized with bucket locks, not
+	 * the socket lock.
+	 */
+	struct homa_rpc_bucket client_rpc_buckets[HOMA_CLIENT_RPC_BUCKETS];
+
+	/**
+	 * @server_rpc_buckets: Hash table for fast lookup of server RPCs.
+	 * Modifications are synchronized with bucket locks, not
+	 * the socket lock.
+	 */
+	struct homa_rpc_bucket server_rpc_buckets[HOMA_SERVER_RPC_BUCKETS];
+};
+
+/**
+ * struct homa_v6_sock - For IPv6, additional IPv6-specific information
+ * is present in the socket struct after Homa-specific information.
+ */
+struct homa_v6_sock {
+	/** @homa: All socket info except for IPv6-specific stuff. */
+	struct homa_sock homa;
+
+	/** @inet6: Socket info specific to IPv6. */
+	struct ipv6_pinfo inet6;
+};
+
+#ifndef __STRIP__ /* See strip.py */
+void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket,
+			   u64 id);
+void homa_sock_lock_slow(struct homa_sock *hsk);
+#endif /* See strip.py */
+int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk,
+		   u16 port);
+void homa_sock_destroy(struct sock *sk);
+struct homa_sock *homa_sock_find(struct homa_net *hnet, u16 port);
+int homa_sock_init(struct homa_sock *hsk);
+void homa_sock_shutdown(struct homa_sock *hsk);
+void homa_sock_unlink(struct homa_sock *hsk);
+int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking);
+void homa_socktab_destroy(struct homa_socktab *socktab,
+			  struct homa_net *hnet);
+void homa_socktab_end_scan(struct homa_socktab_scan *scan);
+void homa_socktab_init(struct homa_socktab *socktab);
+struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan);
+struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab,
+					  struct homa_socktab_scan *scan);
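
The inline helpers that follow (homa_bucket_lock, homa_client_rpc_bucket, and friends) combine to give the atomic lookup-and-lock described in the homa_rpc_bucket comment earlier in this header. Here is a hypothetical lookup, not taken from the sources, that shows the pattern; the hash_links member name is an assumption (the real linkage lives in homa_rpc.h):

```c
/* Hypothetical lookup: take the bucket lock first, then scan the chain,
 * so a matching RPC cannot be deleted between lookup and lock. On a hit
 * the function returns with the bucket lock still held.
 */
static struct homa_rpc *example_find_client_rpc(struct homa_sock *hsk, u64 id)
{
	struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id);
	struct homa_rpc *rpc;

	homa_bucket_lock(bucket, id);
	hlist_for_each_entry(rpc, &bucket->rpcs, hash_links)
		if (rpc->id == id)
			return rpc;	/* Caller must unlock the bucket. */
	homa_bucket_unlock(bucket, id);
	return NULL;
}
```
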
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_sock_lock() - Acquire the lock for a socket. If the socket
+ * isn't immediately available, record stats on the waiting time.
+ * @hsk:    Socket to lock.
+ */
+static inline void homa_sock_lock(struct homa_sock *hsk)
+	__acquires(hsk->lock)
+{
+	if (!spin_trylock_bh(&hsk->lock))
+		homa_sock_lock_slow(hsk);
+}
+#else /* See strip.py */
+/**
+ * homa_sock_lock() - Acquire the lock for a socket.
+ * @hsk:    Socket to lock.
+ */
+static inline void homa_sock_lock(struct homa_sock *hsk)
+	__acquires(hsk->lock)
+{
+	spin_lock_bh(&hsk->lock);
+}
+#endif /* See strip.py */
+
+/**
+ * homa_sock_unlock() - Release the lock for a socket.
+ * @hsk:    Socket to unlock.
+ */
+static inline void homa_sock_unlock(struct homa_sock *hsk)
+	__releases(hsk->lock)
+{
+	spin_unlock_bh(&hsk->lock);
+}
+
+/**
+ * homa_socktab_bucket() - Compute the bucket number in a homa_socktab
+ * that will contain a particular socket.
+ * @hnet:     Network namespace of the desired socket.
+ * @port:     Port number of the socket.
+ *
+ * Return:    The index of the bucket in which a socket matching @hnet and
+ *            @port will be found (if it exists).
+ */
+static inline int homa_socktab_bucket(struct homa_net *hnet, u16 port)
+{
+#ifdef __UNIT_TEST__
+	return port & (HOMA_SOCKTAB_BUCKETS - 1);
+#else /* __UNIT_TEST__ */
+	return hash_32((uintptr_t)hnet ^ port, HOMA_SOCKTAB_BUCKET_BITS);
+#endif /* __UNIT_TEST__ */
+}
+
+/**
+ * homa_client_rpc_bucket() - Find the bucket containing a given
+ * client RPC.
+ * @hsk:      Socket associated with the RPC.
+ * @id:       Id of the desired RPC.
+ *
+ * Return:    The bucket in which this RPC will appear, if the RPC exists.
+ */
+static inline struct homa_rpc_bucket
+		*homa_client_rpc_bucket(struct homa_sock *hsk, u64 id)
+{
+	/* We can use a really simple hash function here because RPC ids
+	 * are allocated sequentially.
+	 */
+	return &hsk->client_rpc_buckets[(id >> 1) &
+			(HOMA_CLIENT_RPC_BUCKETS - 1)];
+}
+
+/**
+ * homa_server_rpc_bucket() - Find the bucket containing a given
+ * server RPC.
+ * @hsk:      Socket associated with the RPC.
+ * @id:       Id of the desired RPC.
+ *
+ * Return:    The bucket in which this RPC will appear, if the RPC exists.
+ */
+static inline struct homa_rpc_bucket
+		*homa_server_rpc_bucket(struct homa_sock *hsk, u64 id)
+{
+	/* Each client allocates RPC ids sequentially, so they will
+	 * naturally distribute themselves across the hash space.
+	 * Thus we can use the id directly as the hash.
+	 */
+	return &hsk->server_rpc_buckets[(id >> 1)
+			& (HOMA_SERVER_RPC_BUCKETS - 1)];
+}
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_bucket_lock() - Acquire the lock for an RPC hash table bucket.
+ * @bucket:   Bucket to lock.
+ * @id:       Id of the RPC on whose behalf the bucket is being locked.
+ *            Used only for metrics.
+ */
+static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id)
+	__acquires(bucket->lock)
+{
+	if (!spin_trylock_bh(&bucket->lock))
+		homa_bucket_lock_slow(bucket, id);
+}
+#else /* See strip.py */
+/**
+ * homa_bucket_lock() - Acquire the lock for an RPC hash table bucket.
+ * @bucket:   Bucket to lock.
+ * @id:       Id of the RPC on whose behalf the bucket is being locked.
+ *            Used only for metrics.
+ */
+static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id)
+	__acquires(bucket->lock)
+{
+	spin_lock_bh(&bucket->lock);
+}
+#endif /* See strip.py */
+
+/**
+ * homa_bucket_unlock() - Release the lock for an RPC hash table bucket.
+ * @bucket:   Bucket to unlock.
+ * @id:       ID of the RPC that was using the lock.
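
The shift-and-mask in the two bucket functions above deserves a concrete trace. The low bit of an RPC id is dropped before masking (it carries role information rather than hash entropy; homa_is_client examines it elsewhere in this diff), so sequentially allocated ids land in consecutive buckets. A userspace model:

```c
#include <stdio.h>

#define MODEL_BUCKETS 1024	/* HOMA_CLIENT_RPC_BUCKETS */

/* Same computation as homa_client_rpc_bucket: drop the low bit, then
 * mask with the table size (a power of 2).
 */
static unsigned int bucket_index(unsigned long long id)
{
	return (id >> 1) & (MODEL_BUCKETS - 1);
}

int main(void)
{
	unsigned long long id;

	/* Ids issued sequentially in steps of 2 (preserving the role
	 * bit) map to buckets 50, 51, 52, ... with no clustering.
	 */
	for (id = 100; id < 112; id += 2)
		printf("id %llu -> bucket %u\n", id, bucket_index(id));
	return 0;
}
```
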
+ */ +static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, u64 id) + __releases(bucket->lock) +{ + spin_unlock_bh(&bucket->lock); +} + +static inline struct homa_sock *homa_sk(const struct sock *sk) +{ + return (struct homa_sock *)sk; +} + +/** + * homa_sock_wmem_avl() - Returns true if the socket is within its limit + * for output memory usage. False means that no new messages should be sent + * until memory is freed. + * @hsk: Socket of interest. + * Return: See above. + */ +static inline bool homa_sock_wmem_avl(struct homa_sock *hsk) +{ + return refcount_read(&hsk->sock.sk_wmem_alloc) < hsk->sock.sk_sndbuf; +} + +/** + * homa_sock_wakeup_wmem() - Invoked when tx packet memory has been freed; + * if memory usage is below the limit and there are tasks waiting for memory, + * wake them up. + * @hsk: Socket of interest. + */ +static inline void homa_sock_wakeup_wmem(struct homa_sock *hsk) +{ + /* Note: can't use sk_stream_write_space for this functionality + * because it uses a different test to determine whether enough + * memory is available. + */ + if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags) && + homa_sock_wmem_avl(hsk)) { + tt_record2("homa_sock_wakeup_wmem waking up port %d, wmem %d", + hsk->port, refcount_read(&hsk->sock.sk_wmem_alloc)); + clear_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags); + rcu_read_lock(); + wake_up_interruptible_poll(sk_sleep(&hsk->sock), EPOLLOUT); + rcu_read_unlock(); + } +} + +#endif /* _HOMA_SOCK_H */ diff --git a/homa_socktab.c b/homa_socktab.c deleted file mode 100644 index 4c9f5ed0..00000000 --- a/homa_socktab.c +++ /dev/null @@ -1,328 +0,0 @@ -/* Copyright (c) 2019, Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* This file manages homa_socktab objects; it also implements several - * operations on homa_sock objects, such as construction and destruction. - */ - -#include "homa_impl.h" - -/** - * homa_socktab_init() - Constructor for homa_socktabs. - * @socktab: The object to initialize; previous contents are discarded. - */ -void homa_socktab_init(struct homa_socktab *socktab) -{ - int i; - spin_lock_init(&socktab->write_lock); - for (i = 0; i < HOMA_SOCKTAB_BUCKETS; i++) { - INIT_HLIST_HEAD(&socktab->buckets[i]); - } -} - -/** - * homa_socktab_destroy() - Destructor for homa_socktabs. - * @socktab: The object to destroy. - */ -void homa_socktab_destroy(struct homa_socktab *socktab) -{ - struct homa_socktab_scan scan; - struct homa_sock *hsk; - - for (hsk = homa_socktab_start_scan(socktab, &scan); hsk != NULL; - hsk = homa_socktab_next(&scan)) { - homa_sock_destroy(hsk); - } -} - -/** - * homa_socktab_start_scan() - Begin an iteration over all of the sockets - * in a socktab. - * @socktab: Socktab to scan. - * @scan: Will hold the current state of the scan; any existing - * contents are discarded. 
- * - * Return: The first socket in the table, or NULL if the table is - * empty. - * - * Each call to homa_socktab_next will return the next socket in the table. - * All sockets that are present in the table at the time this function is - * invoked will eventually be returned, as long as they are not removed - * from the table. It is safe to remove sockets from the table and/or - * delete them while the scan is in progress. If a socket is removed from - * the table during the scan, it may or may not be returned by - * homa_socktab_next. New entries added during the scan may or may not be - * returned. The caller should use RCU to prevent socket storage from - * being reclaimed during the scan. - */ -struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, - struct homa_socktab_scan *scan) -{ - scan->socktab = socktab; - scan->current_bucket = -1; - scan->next = NULL; - return homa_socktab_next(scan); -} - -/** - * homa_starttab_next() - Return the next socket in an iteration over a socktab. - * @scan: State of the scan. - * - * Return: The next socket in the table, or NULL if the iteration has - * returned all of the sockets in the table. Sockets are not - * returned in any particular order. It's possible that the - * returned socket has been destroyed. - */ -struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) -{ - struct homa_sock *hsk; - struct homa_socktab_links *links; - while (1) { - while (scan->next == NULL) { - scan->current_bucket++; - if (scan->current_bucket >= HOMA_SOCKTAB_BUCKETS) - return NULL; - scan->next = (struct homa_socktab_links *) - hlist_first_rcu( - &scan->socktab->buckets[scan->current_bucket]); - } - links = scan->next; - hsk = links->sock; - scan->next = (struct homa_socktab_links *) hlist_next_rcu( - &links->hash_links); - return hsk; - } -} - -/** - * homa_sock_init() - Constructor for homa_sock objects. This function - * initializes only the parts of the socket that are owned by Homa. - * @hsk: Object to initialize. - * @homa: Homa implementation that will manage the socket. - * - * Return: always 0 (success). - */ -void homa_sock_init(struct homa_sock *hsk, struct homa *homa) -{ - struct homa_socktab *socktab = &homa->port_map; - int i; - - spin_lock_bh(&socktab->write_lock); - atomic_set(&hsk->protect_count, 0); - spin_lock_init(&hsk->lock); - hsk->last_locker = "none"; - atomic_set(&hsk->protect_count, 0); - hsk->homa = homa; - hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) - ? 
HOMA_IPV4_HEADER_LENGTH : HOMA_IPV6_HEADER_LENGTH; - hsk->shutdown = false; - while (1) { - if (homa->next_client_port < HOMA_MIN_DEFAULT_PORT) { - homa->next_client_port = HOMA_MIN_DEFAULT_PORT; - } - if (!homa_sock_find(socktab, homa->next_client_port)) { - break; - } - homa->next_client_port++; - } - hsk->port = homa->next_client_port; - hsk->inet.inet_num = hsk->port; - hsk->inet.inet_sport = htons(hsk->port); - homa->next_client_port++; - hsk->socktab_links.sock = hsk; - hlist_add_head_rcu(&hsk->socktab_links.hash_links, - &socktab->buckets[homa_port_hash(hsk->port)]); - INIT_LIST_HEAD(&hsk->active_rpcs); - INIT_LIST_HEAD(&hsk->dead_rpcs); - hsk->dead_skbs = 0; - INIT_LIST_HEAD(&hsk->ready_requests); - INIT_LIST_HEAD(&hsk->ready_responses); - INIT_LIST_HEAD(&hsk->request_interests); - INIT_LIST_HEAD(&hsk->response_interests); - for (i = 0; i < HOMA_CLIENT_RPC_BUCKETS; i++) { - struct homa_rpc_bucket *bucket = &hsk->client_rpc_buckets[i]; - spin_lock_init(&bucket->lock); - INIT_HLIST_HEAD(&bucket->rpcs); - } - for (i = 0; i < HOMA_SERVER_RPC_BUCKETS; i++) { - struct homa_rpc_bucket *bucket = &hsk->server_rpc_buckets[i]; - spin_lock_init(&bucket->lock); - INIT_HLIST_HEAD(&bucket->rpcs); - } - memset(&hsk->buffer_pool, 0, sizeof(hsk->buffer_pool)); - spin_unlock_bh(&socktab->write_lock); -} - -/** - * homa_sock_shutdown() - Disable a socket so that it can no longer - * be used for either sending or receiving messages. Any system calls - * currently waiting to send or receive messages will be aborted. - * @hsk: Socket to shut down. - */ -void homa_sock_shutdown(struct homa_sock *hsk) -{ - struct homa_interest *interest; - struct homa_rpc *rpc; - int i; - - homa_sock_lock(hsk, "homa_socket_shutdown"); - if (hsk->shutdown) { - homa_sock_unlock(hsk); - return; - } - - /* The order of cleanup is very important, because there could be - * active operations that hold RPC locks but not the socket lock. - * 1. Set @shutdown; this ensures that no new RPCs will be created for - * this socket (though some creations might already be in progress). - * 2. Remove the socket from the port map: this ensures that - * incoming packets for the socket will be dropped. - * 3. Go through all of the RPCs and delete them; this will - * synchronize with any operations in progress. - * 4. Perform other socket cleanup: at this point we know that - * there will be no concurrent activities on individual RPCs. - * See sync.txt for additional information about locking. - */ - hsk->shutdown = true; - spin_lock_bh(&hsk->homa->port_map.write_lock); - hlist_del_rcu(&hsk->socktab_links.hash_links); - spin_unlock_bh(&hsk->homa->port_map.write_lock); - homa_sock_unlock(hsk); - - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - homa_rpc_lock(rpc); - homa_rpc_free(rpc); - homa_rpc_unlock(rpc); - } - - homa_sock_lock(hsk, "homa_socket_shutdown #2"); - list_for_each_entry(interest, &hsk->request_interests, request_links) - wake_up_process(interest->thread); - list_for_each_entry(interest, &hsk->response_interests, response_links) - wake_up_process(interest->thread); - homa_sock_unlock(hsk); - - homa_pool_destroy(&hsk->buffer_pool); - - i = 0; - while (!list_empty(&hsk->dead_rpcs)) { - homa_rpc_reap(hsk, 1000); - i++; - if (i == 5) { - tt_record("Freezing because reap seems hung"); - tt_freeze(); - } - } -} - -/** - * homa_sock_destroy() - Destructor for homa_sock objects. This function - * only cleans up the parts of the object that are owned by Homa. - * @hsk: Socket to destroy. 
- */
-void homa_sock_destroy(struct homa_sock *hsk)
-{
-	homa_sock_shutdown(hsk);
-	sock_set_flag(&hsk->inet.sk, SOCK_RCU_FREE);
-}
-
-/**
- * homa_sock_bind() - Associates a server port with a socket; if there
- * was a previous server port assignment for @hsk, it is abandoned.
- * @socktab:   Hash table in which the binding will be recorded.
- * @hsk:       Homa socket.
- * @port:      Desired server port for @hsk. If 0, then this call
- *             becomes a no-op: the socket will continue to use
- *             its randomly assigned client port.
- *
- * Return:  0 for success, otherwise a negative errno.
- */
-int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk,
-		__u16 port)
-{
-	int result = 0;
-	struct homa_sock *owner;
-
-	if (port == 0)
-		return result;
-	if (port >= HOMA_MIN_DEFAULT_PORT) {
-		return -EINVAL;
-	}
-	homa_sock_lock(hsk, "homa_sock_bind");
-	spin_lock_bh(&socktab->write_lock);
-	if (hsk->shutdown) {
-		result = -ESHUTDOWN;
-		goto done;
-	}
-
-	owner = homa_sock_find(socktab, port);
-	if (owner != NULL) {
-		if (owner != hsk)
-			result = -EADDRINUSE;
-		goto done;
-	}
-	hlist_del_rcu(&hsk->socktab_links.hash_links);
-	hsk->port = port;
-	hsk->inet.inet_num = port;
-	hsk->inet.inet_sport = htons(hsk->port);
-	hlist_add_head_rcu(&hsk->socktab_links.hash_links,
-			&socktab->buckets[homa_port_hash(port)]);
-    done:
-	spin_unlock_bh(&socktab->write_lock);
-	homa_sock_unlock(hsk);
-	return result;
-}
-
-/**
- * homa_sock_find() - Returns the socket associated with a given port.
- * @socktab:    Hash table in which to perform lookup.
- * @port:       The port of interest.
- * Return:      The socket that owns @port, or NULL if none.
- *
- * Note: this function uses RCU list-searching facilities, but it doesn't
- * call rcu_read_lock. The caller should do that, if the caller cares (this
- * way, the caller's use of the socket will also be protected).
- */
-struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port)
-{
-	struct homa_socktab_links *link;
-	struct homa_sock *result = NULL;
-	hlist_for_each_entry_rcu(link, &socktab->buckets[homa_port_hash(port)],
-			hash_links) {
-		struct homa_sock *hsk = link->sock;
-		if (hsk->port == port) {
-			result = hsk;
-			break;
-		}
-	}
-	return result;
-}
-
-/**
- * homa_sock_lock_slow() - This function implements the slow path for
- * acquiring a socketC lock. It is invoked when a socket lock isn't immediately
- * available. It waits for the lock, but also records statistics about
- * the waiting time.
- * @hsk:    socket to lock.
- */
-void homa_sock_lock_slow(struct homa_sock *hsk)
-{
-	__u64 start = get_cycles();
-	tt_record("beginning wait for socket lock");
-	spin_lock_bh(&hsk->lock);
-	tt_record("ending wait for socket lock");
-	INC_METRIC(socket_lock_misses, 1);
-	INC_METRIC(socket_lock_miss_cycles, get_cycles() - start);
-}
diff --git a/homa_stub.h b/homa_stub.h
new file mode 100644
index 00000000..502cd93d
--- /dev/null
+++ b/homa_stub.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file contains stripped-down replacements for functionality that
+ * has been temporarily removed from Homa during the Linux upstreaming
+ * process. By the time upstreaming is complete this file will
+ * have gone away.
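
These stubs are meant to be selected at compile time: a consumer includes either the real header or this one depending on __STRIP__. A condensed version of the include pattern that homa_timer.c uses later in this diff (the real file also swaps homa_grant.h and others):

```c
#ifndef __STRIP__ /* See strip.py */
#include "homa_skb.h"	/* Full module: real skb page management. */
#else /* See strip.py */
#include "homa_stub.h"	/* Stripped build: the inline stand-ins below. */
#endif /* See strip.py */
```
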
+ */ + +#ifndef _HOMA_STUB_H +#define _HOMA_STUB_H + +#include "homa_impl.h" + +static inline int homa_skb_init(struct homa *homa) +{ + return 0; +} + +static inline void homa_skb_cleanup(struct homa *homa) +{} + +static inline void homa_skb_release_pages(struct homa *homa) +{} + +static inline int homa_skb_append_from_iter(struct homa *homa, + struct sk_buff *skb, + struct iov_iter *iter, int length) +{ + char *dst = skb_put(skb, length); + + if (copy_from_iter(dst, length, iter) != length) + return -EFAULT; + return 0; +} + +static inline int homa_skb_append_to_frag(struct homa *homa, + struct sk_buff *skb, void *buf, + int length) +{ + char *dst = skb_put(skb, length); + + memcpy(dst, buf, length); + return 0; +} + +static inline int homa_skb_append_from_skb(struct homa *homa, + struct sk_buff *dst_skb, + struct sk_buff *src_skb, + int offset, int length) +{ + return homa_skb_append_to_frag(homa, dst_skb, + skb_transport_header(src_skb) + offset, length); +} + +static inline void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb) +{ + consume_skb(skb); +} + +static inline void homa_skb_free_many_tx(struct homa *homa, + struct sk_buff **skbs, int count) +{ + int i; + + for (i = 0; i < count; i++) + consume_skb(skbs[i]); +} + +static inline void homa_skb_get(struct sk_buff *skb, void *dest, int offset, + int length) +{ + memcpy(dest, skb_transport_header(skb) + offset, length); +} + +static inline struct sk_buff *homa_skb_alloc_tx(int length) +{ + struct sk_buff *skb; + + skb = alloc_skb(HOMA_SKB_EXTRA + sizeof(struct homa_skb_info) + length, + GFP_ATOMIC); + if (likely(skb)) { + skb_reserve(skb, HOMA_SKB_EXTRA); + skb_reset_transport_header(skb); + } + return skb; +} + +static inline void homa_skb_stash_pages(struct homa *homa, int length) +{} + +#endif /* _HOMA_STUB_H */ diff --git a/homa_timer.c b/homa_timer.c index cc91170f..39fb0b19 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -1,184 +1,119 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file handles timing-related functions for Homa, such as retries - * and timeouts. */ + * and timeouts. + */ #include "homa_impl.h" +#include "homa_peer.h" +#include "homa_rpc.h" +#ifndef __STRIP__ /* See strip.py */ +#include "homa_grant.h" +#include "homa_skb.h" +#endif /* See strip.py */ + +#ifdef __STRIP__ /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ /** - * homa_check_rpc() - Invoked for each RPC during each timer pass; does + * homa_timer_check_rpc() - Invoked for each RPC during each timer pass; does * most of the work of checking for time-related actions such as sending - * resends, declaring a host dead, and sending requests for acks. 
Itt is - * separate from homa_timer because homa_timer got too long and deeply - * indented. + * resends, aborting RPCs for which there is no response, and sending + * requests for acks. It is separate from homa_timer because homa_timer + * got too long and deeply indented. * @rpc: RPC to check; must be locked by the caller. - * Return Nonzero means this server has timed out; it's up to the caller - * to abort RPCs involving that server. */ -int homa_check_rpc(struct homa_rpc *rpc) +void homa_timer_check_rpc(struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { - const char *us, *them; - struct resend_header resend; struct homa *homa = rpc->hsk->homa; - struct homa_peer *peer; + int tx_end = homa_rpc_tx_end(rpc); /* See if we need to request an ack for this RPC. */ - if (!homa_is_client(rpc->id) && (rpc->state == RPC_OUTGOING) - && (rpc->msgout.next_xmit_offset >= rpc->msgout.length)) { - if (rpc->done_timer_ticks == 0) + if (!homa_is_client(rpc->id) && rpc->state == RPC_OUTGOING && + tx_end == rpc->msgout.length) { + if (rpc->done_timer_ticks == 0) { rpc->done_timer_ticks = homa->timer_ticks; - else { + } else { /* >= comparison that handles tick wrap-around. */ if ((rpc->done_timer_ticks + homa->request_ack_ticks - - 1 - homa->timer_ticks) & 1<<31) { - struct need_ack_header h; + - 1 - homa->timer_ticks) & 1 << 31) { + struct homa_need_ack_hdr h; + homa_xmit_control(NEED_ACK, &h, sizeof(h), rpc); - tt_record4("Sent NEED_ACK for RPC id %d to " - "peer 0x%x, port %d, ticks %d", - rpc->id, - tt_addr(rpc->peer->addr), - rpc->dport, homa->timer_ticks - - rpc->done_timer_ticks); + tt_record4("Sent NEED_ACK for RPC id %d to peer 0x%x, port %d, ticks %d", + rpc->id, + tt_addr(rpc->peer->addr), + rpc->dport, homa->timer_ticks + - rpc->done_timer_ticks); } } - - /* We don't want to send RESENDs for RPCs in this state. */ - rpc->resend_timer_ticks = homa->timer_ticks; } - if ((rpc->state == RPC_OUTGOING) && (rpc->msgout.next_xmit_offset - < rpc->msgout.granted)) { - /* There are granted bytes that we haven't transmitted, so - * no need to be concerned about lack of traffic from the peer. + if (rpc->state == RPC_INCOMING) { +#ifndef __STRIP__ /* See strip.py */ + if ((rpc->msgin.length - rpc->msgin.bytes_remaining) + >= rpc->msgin.granted) { + /* We've received everything that we've granted, so we + * shouldn't expect to hear anything until we grant more. + */ + rpc->silent_ticks = 0; + return; + } +#endif /* See strip.py */ + if (rpc->msgin.num_bpages == 0) { + /* Waiting for buffer space, so no problem. */ + rpc->silent_ticks = 0; + return; + } + } else if (!homa_is_client(rpc->id)) { + /* We're the server and we've received the input message; + * no need to worry about retries. */ rpc->silent_ticks = 0; - return 0; + return; } - if ((rpc->state == RPC_INCOMING) && ((rpc->msgin.total_length - - rpc->msgin.bytes_remaining) - >= rpc->msgin.incoming)) { - /* We've received everything that we've granted, so we - * shouldn't expect to hear anything until we grant more. - */ - rpc->silent_ticks = 0; - return 0; + if (rpc->state == RPC_OUTGOING) { +#ifndef __STRIP__ /* See strip.py */ + if (tx_end < rpc->msgout.granted) { +#else /* See strip.py */ + if (tx_end < rpc->msgout.length) { +#endif /* See strip.py */ + /* There are granted bytes that we haven't transmitted, + * so no need to be concerned; the ball is in our court. 
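
The tick arithmetic that follows in homa_timer_check_rpc is easy to trace with concrete numbers. Below is a userspace model; resend_ticks, resend_interval, and timeout_ticks are tunables in Homa, and the values used here are invented for illustration (this run produces RESENDs at ticks 5, 7, 9, 11 and an abort at tick 12):

```c
#include <stdio.h>

#define RESEND_TICKS	5	/* First RESEND after 5 silent ticks. */
#define RESEND_INTERVAL	2	/* Then one RESEND every 2 ticks. */
#define TIMEOUT_TICKS	12	/* Abort the RPC after 12 silent ticks. */

int main(void)
{
	int silent_ticks;

	/* silent_ticks is incremented once per timer pass while no
	 * traffic arrives from the peer; the checks below mirror the
	 * order used in homa_timer_check_rpc.
	 */
	for (silent_ticks = 1; silent_ticks <= TIMEOUT_TICKS; silent_ticks++) {
		if (silent_ticks < RESEND_TICKS)
			continue;	/* Peer not yet suspect. */
		if (silent_ticks >= TIMEOUT_TICKS) {
			printf("tick %d: abort RPC (ETIMEDOUT)\n",
			       silent_ticks);
			break;
		}
		if ((silent_ticks - RESEND_TICKS) % RESEND_INTERVAL == 0)
			printf("tick %d: send RESEND\n", silent_ticks);
	}
	return 0;
}
```
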
+ */ + rpc->silent_ticks = 0; + return; + } } - /* The -1 below is so that this RPC in considered in the - * computation of peer->least_recent_rpc just before it reaches - * homa->resend_ticks; the resend won't actually occur for - * another tick. - */ - if (rpc->silent_ticks < (homa->resend_ticks-1)) - return 0; - - peer = rpc->peer; - if (peer->outstanding_resends - >= rpc->hsk->homa->timeout_resends) { - INC_METRIC(peer_timeouts, 1); - tt_record4("peer 0x%x timed out for RPC id %d, " - "state %d, outstanding_resends %d", - tt_addr(peer->addr), rpc->id, rpc->state, - peer->outstanding_resends); + if (rpc->silent_ticks < homa->resend_ticks) + return; + if (rpc->silent_ticks >= homa->timeout_ticks) { + INC_METRIC(rpc_timeouts, 1); + tt_record3("RPC id %d, peer 0x%x, aborted because of timeout, state %d", + rpc->id, tt_addr(rpc->peer->addr), rpc->state); +#ifndef __STRIP__ /* See strip.py */ +#if 0 + homa_rpc_log_active_tt(homa, 0); + tt_record1("Freezing because of RPC abort (id %d)", rpc->id); + homa_freeze_peers(); + tt_freeze(); +#endif if (homa->verbose) - printk(KERN_NOTICE "Homa peer %s timed out, id %llu", - homa_print_ipv6_addr(&peer->addr), - rpc->id); - homa_freeze(rpc, PEER_TIMEOUT, "Freezing because of peer " - "timeout, id %d, peer 0x%x"); - peer->outstanding_resends = 0; - return 1; - } - - /* Resends serve two purposes: to force retransmission of lost packets, - * and to detect if servers have crashed. We only send one resend to - * a given peer at a time: if many RPCs need resends to the same peer, - * it's almost certainly because the peer is overloaded, so we don't - * want to add to its load by sending lots of resends; we just want to - * make sure that it is still alive. However, if there are multiple - * RPCs that need resends, we need to rotate between them, so that - * every RPC eventually gets a resend. In earlier versions of Homa - * we only sent to the oldest RPC, but this led to distributed - * deadlock in situations where the oldest RPC can't make progress - * until some other RPC makes progress (e.g. a server is waiting - * to receive one RPC before it replies to another, or some RPC is - * first on @peer->grantable_rpcs, so it blocks transmissions of - * other RPCs). - */ - - /* First, collect information that will identify the RPC most - * in need of a resend; this will be used during the *next* - * homa_timer pass. - */ - if (peer->current_ticks != homa->timer_ticks) { - /* Reset info for this peer.*/ - peer->resend_rpc = peer->least_recent_rpc; - peer->least_recent_rpc = NULL; - peer->least_recent_ticks = homa->timer_ticks; - peer->current_ticks = homa->timer_ticks; - } - - if ((rpc != peer->resend_rpc) || - (homa->timer_ticks - rpc->peer->most_recent_resend) - < homa->resend_interval) { - /* We're not sending a resend to this RPC now. Update info - * about the best RPC for the next resend. Note: comparing - * values in the face of wrap-around and compiler - * optimizations is tricky; don't change the comparison below - * unless you're sure you know what you are doing. - */ - if (!((peer->least_recent_ticks - rpc->resend_timer_ticks) - & (1U<<31))) { - peer->least_recent_rpc = rpc; - peer->least_recent_ticks = rpc->resend_timer_ticks; - } - return 0; - } - - /* Issue a resend for this RPC. 
*/ - rpc->resend_timer_ticks = homa->timer_ticks; - rpc->peer->most_recent_resend = homa->timer_ticks; - rpc->peer->outstanding_resends++; - homa_get_resend_range(&rpc->msgin, &resend); - resend.priority = homa->num_priorities-1; - homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); - if (homa_is_client(rpc->id)) { - us = "client"; - them = "server"; - tt_record4("Sent RESEND for client RPC id %llu, server 0x%x:%d, " - "offset %d", - rpc->id, tt_addr(rpc->peer->addr), - rpc->dport, ntohl(resend.offset)); - } else { - us = "server"; - them = "client"; - tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d " - "offset %d", - rpc->id, tt_addr(rpc->peer->addr), - rpc->dport, ntohl(resend.offset)); - } - if (homa->verbose) { - printk(KERN_NOTICE "Homa %s RESEND to %s %s:%d for id %llu, " - "offset %d, length %d", us, them, - homa_print_ipv6_addr(&rpc->peer->addr), - rpc->dport, rpc->id, ntohl(resend.offset), - ntohl(resend.length)); + pr_notice("RPC id %llu, peer %s, aborted because of timeout, state %d\n", + rpc->id, + homa_print_ipv6_addr(&rpc->peer->addr), + rpc->state); +#endif /* See strip.py */ + homa_rpc_abort(rpc, -ETIMEDOUT); + return; } - return 0; + if (((rpc->silent_ticks - homa->resend_ticks) % homa->resend_interval) + == 0) + homa_request_retrans(rpc); } /** @@ -191,31 +126,73 @@ void homa_timer(struct homa *homa) struct homa_socktab_scan scan; struct homa_sock *hsk; struct homa_rpc *rpc; - cycles_t start, end; - struct homa_peer *dead_peer = NULL; int rpc_count = 0; - int total_rpcs = 0; +#ifndef __STRIP__ /* See strip.py */ + static u64 prev_grant_count; int total_incoming_rpcs = 0; - int total_incoming_bytes = 0; + int sum_incoming_rec = 0; + static int zero_count; + int sum_incoming = 0; + int total_rpcs = 0; + u64 total_grants; + cycles_t start; + cycles_t end; + int core; +#endif /* See strip.py */ - start = get_cycles(); homa->timer_ticks++; - /* Scan all existing RPCs in all sockets. The rcu_read_lock - * below prevents sockets from being deleted during the scan. - */ - rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { +#ifndef __STRIP__ /* See strip.py */ + start = homa_clock(); + total_grants = 0; + for (core = 0; core < nr_cpu_ids; core++) { + struct homa_metrics *m = homa_metrics_per_cpu(); + + total_grants += m->packets_sent[GRANT - DATA]; + } + + if (atomic_read(&homa->grant->total_incoming) != 0 || + homa->grant->num_grantable_rpcs != 0 || + homa->grant->num_active_rpcs != 0 || + total_grants - prev_grant_count != 0) + tt_record4("homa_timer found total_incoming %d, num_grantable_rpcs %d, num_active_rpcs %d, new grants %d", + atomic_read(&homa->grant->total_incoming), + homa->grant->num_grantable_rpcs, + homa->grant->num_active_rpcs, + total_grants - prev_grant_count); + if (total_grants == prev_grant_count && + homa->grant->num_grantable_rpcs > 20) { + zero_count++; + if (zero_count > 3 && !atomic_read(&tt_frozen) && 0) { + pr_err("%s found no grants going out\n", __func__); + homa_rpc_log_active_tt(homa, 0); + tt_record("freezing because no grants are going out"); + homa_freeze_peers(); + tt_freeze(); + } + } else { + zero_count = 0; + } + prev_grant_count = total_grants; +#endif /* See strip.py */ + + /* Scan all existing RPCs in all sockets. 
*/ + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); + hsk; hsk = homa_socktab_next(&scan)) { while (hsk->dead_skbs >= homa->dead_buffs_limit) { - /* If we get here, it means that homa_wait_for_message - * isn't keeping up with RPC reaping, so we'll help - * out. See reap.txt for more info. */ - uint64_t start = get_cycles(); + /* If we get here, it means that Homa isn't keeping + * up with RPC reaping, so we'll help out. See + * "RPC Reaping Strategy" in homa_rpc_reap code for + * details. + */ +#ifndef __STRIP__ /* See strip.py */ + u64 rpc_start = homa_clock(); +#endif /* See strip.py */ + tt_record("homa_timer calling homa_rpc_reap"); - if (homa_rpc_reap(hsk, hsk->homa->reap_limit) == 0) + if (homa_rpc_reap(hsk, false) == 0) break; - INC_METRIC(timer_reap_cycles, get_cycles() - start); + INC_METRIC(timer_reap_cycles, homa_clock() - rpc_start); } if (list_empty(&hsk->active_rpcs) || hsk->shutdown) @@ -223,26 +200,31 @@ void homa_timer(struct homa *homa) if (!homa_protect_rpcs(hsk)) continue; + rcu_read_lock(); list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - total_rpcs++; + IF_NO_STRIP(total_rpcs++); + homa_rpc_lock(rpc); if (rpc->state == RPC_IN_SERVICE) { rpc->silent_ticks = 0; homa_rpc_unlock(rpc); continue; +#ifndef __STRIP__ /* See strip.py */ } else if (rpc->state == RPC_INCOMING) { - total_incoming_rpcs++; - total_incoming_bytes += rpc->msgin.total_length; + total_incoming_rpcs += 1; + sum_incoming_rec += rpc->msgin.rec_incoming; + sum_incoming += rpc->msgin.granted + - (rpc->msgin.length + - rpc->msgin.bytes_remaining); +#endif /* See strip.py */ } rpc->silent_ticks++; - if (homa_check_rpc(rpc)) - dead_peer = rpc->peer; + homa_timer_check_rpc(rpc); homa_rpc_unlock(rpc); rpc_count++; if (rpc_count >= 10) { /* Give other kernel threads a chance to run - * on this core. Must release the RCU read lock - * while doing this. + * on this core. */ rcu_read_unlock(); schedule(); @@ -250,23 +232,21 @@ void homa_timer(struct homa *homa) rpc_count = 0; } } + rcu_read_unlock(); homa_unprotect_rpcs(hsk); } - rcu_read_unlock(); - if (dead_peer) { - /* We only timeout one peer per call to this function (it's - * tricky from a synchronization standpoint to handle the - * crash in the middle of the loop above, and trying to - * remember more than one dead peer until we get here adds - * complexity). If there's more than one dead peer, we'll - * timeout another one in the next call. 
- */ - homa_abort_rpcs(homa, &dead_peer->addr, 0, -ETIMEDOUT); - } - -// if (total_rpcs > 0) -// tt_record1("homa_timer finished scanning %d RPCs", total_rpcs); - - end = get_cycles(); - INC_METRIC(timer_cycles, end-start); + homa_socktab_end_scan(&scan); +#ifndef __STRIP__ /* See strip.py */ + if (total_incoming_rpcs > 0) + tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d", + total_incoming_rpcs, sum_incoming, sum_incoming_rec, + atomic_read(&homa->grant->total_incoming)); +#endif /* See strip.py */ + homa_skb_release_pages(homa); + homa_peer_gc(homa->peertab); +#ifndef __STRIP__ /* See strip.py */ + homa_snapshot_rpcs(); + end = homa_clock(); + INC_METRIC(timer_cycles, end - start); +#endif /* See strip.py */ } diff --git a/homa_utils.c b/homa_utils.c index be3880f2..e11fe969 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -1,31 +1,21 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* This file contains miscellaneous utility functions for the Homa protocol. */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ -#include "homa_impl.h" - -/* Core-specific information. NR_CPUS is an overestimate of the actual - * number, but allows us to allocate the array statically. +/* This file contains miscellaneous utility functions for Homa, such + * as initializing and destroying homa structs. */ -struct homa_core *homa_cores[NR_CPUS]; -/* Points to block of memory holding all homa_cores; used to free it. */ -char *core_memory; +#include "homa_impl.h" +#include "homa_peer.h" +#include "homa_rpc.h" -struct completion homa_pacer_kthread_done; +#ifndef __STRIP__ /* See strip.py */ +#include "homa_grant.h" +#include "homa_pacer.h" +#include "homa_qdisc.h" +#include "homa_skb.h" +#else /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ /** * homa_init() - Constructor for homa objects. @@ -37,78 +27,65 @@ struct completion homa_pacer_kthread_done; */ int homa_init(struct homa *homa) { - size_t aligned_size; - char *first; - int i, err; - _Static_assert(HOMA_MAX_PRIORITIES >= 8, - "homa_init assumes at least 8 priority levels"); + int err; - /* Initialize core-specific info (if no-one else has already done it), - * making sure that each core has private cache lines. 
- */ - if (!core_memory) { - aligned_size = (sizeof(struct homa_core) + 0x3f) & ~0x3f; - core_memory = vmalloc(0x3f + (nr_cpu_ids*aligned_size)); - if (!core_memory) { - printk(KERN_ERR "Homa couldn't allocate memory " - "for core-specific data\n"); - return -ENOMEM; - } - first = (char *) (((__u64) core_memory + 0x3f) & ~0x3f); - for (i = 0; i < nr_cpu_ids; i++) { - struct homa_core *core; - core = (struct homa_core *) (first + i*aligned_size); - homa_cores[i] = core; - core->last_active = 0; - core->last_gro = 0; - atomic_set(&core->softirq_backlog, 0); - core->softirq_offset = 0; - core->held_skb = NULL; - core->held_bucket = 0; - memset(&core->metrics, 0, sizeof(core->metrics)); - } - } + IF_NO_STRIP(int i); + + memset(homa, 0, sizeof(*homa)); - homa->pacer_kthread = NULL; - init_completion(&homa_pacer_kthread_done); atomic64_set(&homa->next_outgoing_id, 2); - atomic64_set(&homa->link_idle_time, get_cycles()); - spin_lock_init(&homa->grantable_lock); - INIT_LIST_HEAD(&homa->grantable_peers); - homa->num_grantable_peers = 0; - homa->grant_nonfifo = 0; - homa->grant_nonfifo_left = 0; - spin_lock_init(&homa->pacer_mutex); - homa->pacer_fifo_fraction = 50; - homa->pacer_fifo_count = 1; - homa->pacer_wake_time = 0; - spin_lock_init(&homa->throttle_lock); - INIT_LIST_HEAD_RCU(&homa->throttled_rpcs); - homa->throttle_add = 0; - homa->throttle_min_bytes = 1000; - atomic_set(&homa->total_incoming, 0); - homa->next_client_port = HOMA_MIN_DEFAULT_PORT; - homa_socktab_init(&homa->port_map); - err = homa_peertab_init(&homa->peers); + homa->link_mbps = 25000; +#ifndef __STRIP__ /* See strip.py */ + homa->qshared = homa_qdisc_shared_alloc(); + if (IS_ERR(homa->qshared)) { + err = PTR_ERR(homa->qshared); + homa->qshared = NULL; + return err; + } + homa->pacer = homa_pacer_alloc(homa); + if (IS_ERR(homa->pacer)) { + err = PTR_ERR(homa->pacer); + homa->pacer = NULL; + return err; + } + homa->grant = homa_grant_alloc(homa); + if (IS_ERR(homa->grant)) { + err = PTR_ERR(homa->grant); + homa->grant = NULL; + return err; + } +#endif /* See strip.py */ + homa->peertab = homa_peer_alloc_peertab(); + if (IS_ERR(homa->peertab)) { + err = PTR_ERR(homa->peertab); + homa->peertab = NULL; + return err; + } + homa->socktab = kmalloc(sizeof(*homa->socktab), GFP_KERNEL); + if (!homa->socktab) + return -ENOMEM; + homa_socktab_init(homa->socktab); +#ifndef __STRIP__ /* See strip.py */ + err = homa_skb_init(homa); if (err) { - printk(KERN_ERR "Couldn't initialize peer table (errno %d)\n", - -err); + pr_err("Couldn't initialize skb management (errno %d)\n", + -err); return err; } +#endif /* See strip.py */ /* Wild guesses to initialize configuration values... 
 */
-	homa->rtt_bytes = 10000;
-	homa->max_grant_window = 0;
-	homa->link_mbps = 10000;
+#ifndef __STRIP__ /* See strip.py */
+	homa->unsched_bytes = 40000;
 	homa->poll_usecs = 50;
 	homa->num_priorities = HOMA_MAX_PRIORITIES;
 	for (i = 0; i < HOMA_MAX_PRIORITIES; i++)
 		homa->priority_map[i] = i;
 	homa->max_sched_prio = HOMA_MAX_PRIORITIES - 5;
-	homa->unsched_cutoffs[HOMA_MAX_PRIORITIES-1] = 200;
-	homa->unsched_cutoffs[HOMA_MAX_PRIORITIES-2] = 2800;
-	homa->unsched_cutoffs[HOMA_MAX_PRIORITIES-3] = 15000;
-	homa->unsched_cutoffs[HOMA_MAX_PRIORITIES-4] = HOMA_MAX_MESSAGE_LENGTH;
+	homa->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 200;
+	homa->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = 2800;
+	homa->unsched_cutoffs[HOMA_MAX_PRIORITIES - 3] = 15000;
+	homa->unsched_cutoffs[HOMA_MAX_PRIORITIES - 4] = HOMA_MAX_MESSAGE_LENGTH;
 #ifdef __UNIT_TEST__
 	/* Unit tests won't send CUTOFFS messages unless the test changes
 	 * this variable.
@@ -117,1478 +94,101 @@ int homa_init(struct homa *homa)
 #else
 	homa->cutoff_version = 1;
 #endif
-	homa->fifo_grant_increment = 10000;
-	homa->grant_fifo_fraction = 50;
-	homa->duty_cycle = 800;
-	homa->grant_threshold = 0;
-	homa->max_overcommit = 8;
-	homa->max_incoming = 0;
-	homa->resend_ticks = 15;
-	homa->resend_interval = 10;
+#endif /* See strip.py */
+	homa->resend_ticks = 5;
+	homa->resend_interval = 5;
+	homa->timeout_ticks = 100;
 	homa->timeout_resends = 5;
 	homa->request_ack_ticks = 2;
 	homa->reap_limit = 10;
 	homa->dead_buffs_limit = 5000;
-	homa->max_dead_buffs = 0;
-	homa->pacer_kthread = kthread_run(homa_pacer_main, homa,
-			"homa_pacer");
-	if (IS_ERR(homa->pacer_kthread)) {
-		err = PTR_ERR(homa->pacer_kthread);
-		homa->pacer_kthread = NULL;
-		printk(KERN_ERR "couldn't create homa pacer thread: error %d\n",
-			err);
-		return err;
-	}
-	homa->pacer_exit = false;
-	homa->max_nic_queue_ns = 2000;
-	homa->cycles_per_kbyte = 0;
+#ifndef __STRIP__ /* See strip.py */
 	homa->verbose = 0;
+#endif /* See strip.py */
 	homa->max_gso_size = 10000;
+	homa->wmem_max = 100000000;
+#ifndef __STRIP__ /* See strip.py */
 	homa->max_gro_skbs = 20;
-	homa->gso_force_software = 0;
 	homa->gro_policy = HOMA_GRO_NORMAL;
-	homa->gro_busy_usecs = 10;
-	homa->timer_ticks = 0;
-	spin_lock_init(&homa->metrics_lock);
-	homa->metrics = NULL;
-	homa->metrics_capacity = 0;
-	homa->metrics_length = 0;
-	homa->metrics_active_opens = 0;
-	homa->flags = 0;
-	homa->freeze_type = 0;
-	homa->sync_freeze = 0;
+	homa->busy_usecs = 100;
+	homa->gro_busy_usecs = 5;
+#endif /* See strip.py */
 	homa->bpage_lease_usecs = 10000;
-	homa_outgoing_sysctl_changed(homa);
+#ifndef __STRIP__ /* See strip.py */
 	homa_incoming_sysctl_changed(homa);
+#endif /* See strip.py */
 	return 0;
 }
 
 /**
  * homa_destroy() -  Destructor for homa objects.
- * @homa:      Object to destroy.
+ * @homa:      Object to destroy. It is safe to invoke this function on an
+ *	       object that has already been destroyed.
  */
 void homa_destroy(struct homa *homa)
 {
-	int i;
-	if (homa->pacer_kthread) {
-		homa_pacer_stop(homa);
-		wait_for_completion(&homa_pacer_kthread_done);
-	}
-
-	/* The order of the following 2 statements matters! */
-	homa_socktab_destroy(&homa->port_map);
-	homa_peertab_destroy(&homa->peers);
-	if (core_memory) {
-		vfree(core_memory);
-		core_memory = NULL;
-		for (i = 0; i < nr_cpu_ids; i++) {
-			homa_cores[i] = NULL;
-		}
-	}
-	if (homa->metrics)
-		kfree(homa->metrics);
-}
-
-/**
- * homa_rpc_new_client() - Allocate and construct a client RPC (one that is used
- * to issue an outgoing request). Doesn't send any packets. Invoked with no
- * locks held.
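One convention above is worth calling out for review: every allocation in homa_init stores its result only after an IS_ERR/NULL check, and on failure it leaves the corresponding field NULL before returning. That is what lets the updated homa_destroy comment promise that destroying a partially constructed (or already destroyed) homa is safe; the matching pointer tests appear in homa_destroy's new body later in this patch. A standalone sketch of the convention, with placeholder names only:

```c
#include <errno.h>
#include <stdlib.h>

struct subsys { int dummy; };

/* Placeholder container mirroring struct homa's subsystem pointers. */
struct container {
	struct subsys *a;
	struct subsys *b;
};

static int container_init(struct container *c)
{
	c->a = calloc(1, sizeof(*c->a));
	if (!c->a)
		return -ENOMEM;	/* c->a stays NULL: destroy is safe */
	c->b = calloc(1, sizeof(*c->b));
	if (!c->b)
		return -ENOMEM;	/* partial init: destroy is still safe */
	return 0;
}

/* Test-and-NULL on each field makes the destructor idempotent, just as
 * homa_destroy may run on a partially initialized or already-destroyed
 * homa.
 */
static void container_destroy(struct container *c)
{
	if (c->b) {
		free(c->b);
		c->b = NULL;
	}
	if (c->a) {
		free(c->a);
		c->a = NULL;
	}
}
```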
- * @hsk: Socket to which the RPC belongs. - * @dest: Address of host (ip and port) to which the RPC will be sent. - * - * Return: A printer to the newly allocated object, or a negative - * errno if an error occurred. The RPC will be locked; the - * caller must eventually unlock it. - */ -struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, - const sockaddr_in_union *dest) -{ - int err; - struct homa_rpc *crpc; - struct homa_rpc_bucket *bucket; - struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest); - - crpc = (struct homa_rpc *) kmalloc(sizeof(*crpc), GFP_KERNEL); - if (unlikely(!crpc)) - return ERR_PTR(-ENOMEM); - - /* Initialize fields that don't require the socket lock. */ - crpc->hsk = hsk; - crpc->id = atomic64_fetch_add(2, &hsk->homa->next_outgoing_id); - bucket = homa_client_rpc_bucket(hsk, crpc->id); - crpc->lock = &bucket->lock; - crpc->state = RPC_OUTGOING; - atomic_set(&crpc->flags, 0); - atomic_set(&crpc->grants_in_progress, 0); - crpc->peer = homa_peer_find(&hsk->homa->peers, &dest_addr_as_ipv6, - &hsk->inet); - if (unlikely(IS_ERR(crpc->peer))) { - tt_record("error in homa_peer_find"); - err = PTR_ERR(crpc->peer); - goto error; - } - crpc->dport = ntohs(dest->in6.sin6_port); - crpc->completion_cookie = 0; - crpc->error = 0; - crpc->msgin.total_length = -1; - crpc->msgin.num_skbs = 0; - crpc->msgin.num_bpages = 0; - memset(&crpc->msgout, 0, sizeof(crpc->msgout)); - crpc->msgout.length = -1; - INIT_LIST_HEAD(&crpc->ready_links); - INIT_LIST_HEAD(&crpc->dead_links); - crpc->interest = NULL; - INIT_LIST_HEAD(&crpc->grantable_links); - INIT_LIST_HEAD(&crpc->throttled_links); - crpc->silent_ticks = 0; - crpc->resend_timer_ticks = hsk->homa->timer_ticks; - crpc->done_timer_ticks = 0; - crpc->magic = HOMA_RPC_MAGIC; - crpc->start_cycles = get_cycles(); - - /* Initialize fields that require locking. This allows the most - * expensive work, such as copying in the message from user space, - * to be performed without holding locks. Also, can't hold spin - * locks while doing things that could block, such as memory allocation. - */ - homa_bucket_lock(bucket, client); - homa_sock_lock(hsk, "homa_rpc_new_client"); - if (hsk->shutdown) { - homa_sock_unlock(hsk); - homa_rpc_unlock(crpc); - err = -ESHUTDOWN; - goto error; - } - hlist_add_head(&crpc->hash_links, &bucket->rpcs); - list_add_tail_rcu(&crpc->active_links, &hsk->active_rpcs); - homa_sock_unlock(hsk); - - return crpc; - -error: - kfree(crpc); - return ERR_PTR(err); -} - -/** - * homa_rpc_new_server() - Allocate and construct a server RPC (one that is - * used to manage an incoming request). If appropriate, the RPC will also - * be handed off (we do it here, while we have the socket locked, to avoid - * acquiring the socket lock a second time). - * @hsk: Socket that owns this RPC. - * @source: IP address (network byte order) of the RPC's client. - * @h: Header for the first data packet received for this RPC; used - * to initialize the RPC. - * - * Return: A pointer to a new RPC, which is locked, or a negative errno - * if an error occurred. If there is already an RPC corresponding - * to h, then it is returned instead of creating a new RPC. - */ -struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, - const struct in6_addr *source, struct data_header *h) -{ - int err; - struct homa_rpc *srpc = NULL; - __u64 id = homa_local_id(h->common.sender_id); - struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); - - /* Lock the bucket, and make sure no-one else has already created - * the desired RPC. 
- */ - homa_bucket_lock(bucket, server); - hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { - if ((srpc->id == id) && - (srpc->dport == ntohs(h->common.sport)) && - ipv6_addr_equal(&srpc->peer->addr, source)) { - /* RPC already exists; just return it instead - * of creating a new RPC. - */ - return srpc; - } - } - - /* Initialize fields that don't require the socket lock. */ - srpc = (struct homa_rpc *) kmalloc(sizeof(*srpc), GFP_KERNEL); - if (!srpc) { - err = -ENOMEM; - goto error; - } - srpc->hsk = hsk; - srpc->lock = &bucket->lock; - srpc->state = RPC_INCOMING; - atomic_set(&srpc->flags, 0); - atomic_set(&srpc->grants_in_progress, 0); - srpc->peer = homa_peer_find(&hsk->homa->peers, source, &hsk->inet); - if (unlikely(IS_ERR(srpc->peer))) { - err = PTR_ERR(srpc->peer); - goto error; - } - srpc->dport = ntohs(h->common.sport); - srpc->id = id; - srpc->completion_cookie = 0; - srpc->error = 0; - srpc->msgin.total_length = -1; - srpc->msgin.num_skbs = 0; - srpc->msgin.num_bpages = 0; - memset(&srpc->msgout, 0, sizeof(srpc->msgout)); - srpc->msgout.length = -1; - INIT_LIST_HEAD(&srpc->ready_links); - INIT_LIST_HEAD(&srpc->dead_links); - srpc->interest = NULL; - INIT_LIST_HEAD(&srpc->grantable_links); - INIT_LIST_HEAD(&srpc->throttled_links); - srpc->silent_ticks = 0; - srpc->resend_timer_ticks = hsk->homa->timer_ticks; - srpc->done_timer_ticks = 0; - srpc->magic = HOMA_RPC_MAGIC; - srpc->start_cycles = get_cycles(); - - /* Initialize fields that require socket to be locked. */ - homa_sock_lock(hsk, "homa_rpc_new_server"); - if (hsk->shutdown) { - homa_sock_unlock(hsk); - err = -ESHUTDOWN; - goto error; - } - hlist_add_head(&srpc->hash_links, &bucket->rpcs); - list_add_tail_rcu(&srpc->active_links, &hsk->active_rpcs); - if (ntohl(h->seg.offset) == 0) { - atomic_or(RPC_PKTS_READY, &srpc->flags); - homa_rpc_handoff(srpc); - } - homa_sock_unlock(hsk); - INC_METRIC(requests_received, 1); - return srpc; - -error: - spin_unlock_bh(&bucket->lock); - if (srpc) - kfree(srpc); - return ERR_PTR(err); -} - -/** - * homa_rpc_lock_slow() - This function implements the slow path for - * acquiring an RPC lock. It is invoked when an RPC lock isn't immediately - * available. It waits for the lock, but also records statistics about - * the waiting time. - * @rpc: RPC to lock. - */ -void homa_rpc_lock_slow(struct homa_rpc *rpc) -{ - __u64 start = get_cycles(); - tt_record("beginning wait for rpc lock"); - spin_lock_bh(rpc->lock); - tt_record("ending wait for rpc lock"); - if (homa_is_client(rpc->id)) { - INC_METRIC(client_lock_misses, 1); - INC_METRIC(client_lock_miss_cycles, get_cycles() - start); - } else { - INC_METRIC(server_lock_misses, 1); - INC_METRIC(server_lock_miss_cycles, get_cycles() - start); - } -} - -/** - * homa_rpc_acked() - This function is invoked when an ack is received - * for an RPC; if the RPC still exists, is freed. - * @hsk: Socket on which the ack was received. May or may not correspond - * to the RPC, but can sometimes be used to avoid a socket lookup. - * @saddr: Source address from which the act was received (the client - * note for the RPC) - * @ack: Information about an RPC from @saddr that may now be deleted safely. 
- */ -void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, - struct homa_ack *ack) -{ - struct homa_rpc *rpc; - struct homa_sock *hsk2 = hsk; - __u64 id = homa_local_id(ack->client_id); - __u16 client_port = ntohs(ack->client_port); - __u16 server_port = ntohs(ack->server_port); - - if (hsk2->port != server_port) { - /* Without RCU, sockets other than hsk can be deleted - * out from under us. - */ - rcu_read_lock(); - hsk2 = homa_sock_find(&hsk->homa->port_map, server_port); - if (!hsk2) - goto done; - } - rpc = homa_find_server_rpc(hsk2, saddr, client_port, id); - if (rpc) { - homa_rpc_free(rpc); - homa_rpc_unlock(rpc); - } - - done: - if (hsk->port != server_port) - rcu_read_unlock(); -} - -/** - * homa_rpc_free() - Destructor for homa_rpc; will arrange for all resources - * associated with the RPC to be released (eventually). - * @rpc: Structure to clean up, or NULL. Must be locked. Its socket must - * not be locked. - */ -void homa_rpc_free(struct homa_rpc *rpc) -{ - int delta; - if (!rpc || (rpc->state == RPC_DEAD)) - return; - - /* Before doing anything else, unlink the input message from - * homa->grantable_msgs. This will synchronize to ensure that - * homa_manage_grants doesn't access this RPC after destruction - * begins. - */ - rpc->state = RPC_DEAD; - homa_remove_from_grantable(rpc->hsk->homa, rpc); - - /* Unlink from all lists, so no-one will ever find this RPC again. */ - homa_sock_lock(rpc->hsk, "homa_rpc_free"); - __hlist_del(&rpc->hash_links); - list_del_rcu(&rpc->active_links); - list_add_tail_rcu(&rpc->dead_links, &rpc->hsk->dead_rpcs); - rpc->hsk->dead_skbs += rpc->msgin.num_skbs + rpc->msgout.num_skbs; - if (rpc->hsk->dead_skbs > rpc->hsk->homa->max_dead_buffs) - /* This update isn't thread-safe; it's just a - * statistic so it's OK if updates occasionally get - * missed. - */ - rpc->hsk->homa->max_dead_buffs = rpc->hsk->dead_skbs; - __list_del_entry(&rpc->ready_links); - if (rpc->interest != NULL) { - rpc->interest->reg_rpc = NULL; - wake_up_process(rpc->interest->thread); - rpc->interest = NULL; - } -// tt_record3("Freeing rpc id %d, socket %d, dead_skbs %d", rpc->id, -// rpc->hsk->client_port, -// rpc->hsk->dead_skbs); - - /* If the RPC had incoming bytes, remove them from the global count. */ - delta = (rpc->msgin.total_length < 0) ? 0 - : (rpc->msgin.incoming - (rpc->msgin.total_length - - rpc->msgin.bytes_remaining)); - if (delta != 0) - atomic_add(-delta, &rpc->hsk->homa->total_incoming); - if (unlikely(rpc->msgin.num_bpages)) - homa_pool_release_buffers(&rpc->hsk->buffer_pool, - rpc->msgin.num_bpages, rpc->msgin.bpage_offsets); - - homa_sock_unlock(rpc->hsk); - homa_remove_from_throttled(rpc); -} - -/** - * homa_rpc_reap() - Invoked to release resources associated with dead - * RPCs for a given socket. For a large RPC, it can take a long time to - * free all of its packet buffers, so we try to perform this work - * off the critical path where it won't delay applications. Each call to - * this function does a small chunk of work. See the file reap.txt for - * more information. - * @hsk: Homa socket that may contain dead RPCs. Must not be locked by the - * caller; this function will lock and release. - * @count: Number of buffers to free during this call. - * - * Return: A return value of 0 means that we ran out of work to do; calling - * again will do no work (there could be unreaped RPCs, but if so, - * reaping has been disabled for them). A value greater than - * zero means there is still more reaping work to be done. 
- */ -int homa_rpc_reap(struct homa_sock *hsk, int count) -{ -#ifdef __UNIT_TEST__ -#define BATCH_MAX 3 -#else -#define BATCH_MAX 20 -#endif - struct sk_buff *skbs[BATCH_MAX]; - struct homa_rpc *rpcs[BATCH_MAX]; - int num_skbs, num_rpcs; - struct homa_rpc *rpc; - int i, batch_size; - int result; - - INC_METRIC(reaper_calls, 1); - INC_METRIC(reaper_dead_skbs, hsk->dead_skbs); - - /* Each iteration through the following loop will reap - * BATCH_MAX skbs. - */ - while (count > 0) { - batch_size = count; - if (batch_size > BATCH_MAX) - batch_size = BATCH_MAX; - count -= batch_size; - num_skbs = num_rpcs = 0; - - homa_sock_lock(hsk, "homa_rpc_reap"); - if (atomic_read(&hsk->protect_count)) { - INC_METRIC(disabled_reaps, 1); - tt_record2("homa_rpc_reap returning: protect_count " - "%d, dead_skbs %d", - atomic_read(&hsk->protect_count), - hsk->dead_skbs); - homa_sock_unlock(hsk); - return 0; - } - - /* Collect buffers and freeable RPCs. */ - list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) { - if ((atomic_read(&rpc->flags) & RPC_CANT_REAP) - || (atomic_read(&rpc->grants_in_progress) - != 0) - || (atomic_read(&rpc->msgout.active_xmits) - != 0)) { - INC_METRIC(disabled_rpc_reaps, 1); - continue; - } - rpc->magic = 0; - if (rpc->msgout.length >= 0) { - while (rpc->msgout.packets) { - skbs[num_skbs] = rpc->msgout.packets; - rpc->msgout.packets = homa_get_skb_info( - rpc->msgout.packets) - ->next_skb; - num_skbs++; - rpc->msgout.num_skbs--; - if (num_skbs >= batch_size) - goto release; - } - } - i = 0; - if (rpc->msgin.total_length >= 0) { - while (1) { - struct sk_buff *skb; - skb = skb_dequeue(&rpc->msgin.packets); - if (!skb) - break; - skbs[num_skbs] = skb; - num_skbs++; - rpc->msgin.num_skbs--; - if (num_skbs >= batch_size) - goto release; - } - } - - /* If we get here, it means all packets have been - * removed from the RPC. - */ - rpcs[num_rpcs] = rpc; - num_rpcs++; - list_del_rcu(&rpc->dead_links); - if (num_rpcs >= batch_size) - goto release; - } - - /* Free all of the collected resources; release the socket - * lock while doing this. - */ - release: - hsk->dead_skbs -= num_skbs; - result = !list_empty(&hsk->dead_rpcs) - && ((num_skbs + num_rpcs) != 0); - homa_sock_unlock(hsk); - for (i = 0; i < num_skbs; i++) - kfree_skb(skbs[i]); - for (i = 0; i < num_rpcs; i++) { - UNIT_LOG("; ", "reaped %llu", rpcs[i]->id); - /* Lock and unlock the RPC before freeing it. This - * is needed to deal with races where the last user - * of the RPC (such as homa_ioc_reply) hasn't - * unlocked it yet. - */ - homa_rpc_lock(rpcs[i]); - homa_rpc_unlock(rpcs[i]); - rpcs[i]->state = 0; - kfree(rpcs[i]); - } - tt_record4("reaped %d skbs, %d rpcs; %d skbs remain for port %d", - num_skbs, num_rpcs, hsk->dead_skbs, hsk->port); - if (!result) - break; - } - return result; -} - -/** - * homa_find_client_rpc() - Locate client-side information about the RPC that - * a packet belongs to, if there is any. Thread-safe without socket lock. - * @hsk: Socket via which packet was received. - * @id: Unique identifier for the RPC. - * - * Return: A pointer to the homa_rpc for this id, or NULL if none. - * The RPC will be locked; the caller must eventually unlock it - * by invoking homa_rpc_unlock. 
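Although this version of homa_rpc_reap is deleted here (the timer now calls its successor as homa_rpc_reap(hsk, false)), its structure explains the "collect, then free" shape that survives in the new code: skbs and reapable RPCs are gathered in batches of at most BATCH_MAX while the socket lock is held, and all freeing happens with the lock released, so tearing down large messages never delays other lock holders. A condensed, compilable sketch of that shape, with illustrative names:

```c
#include <stdlib.h>

/* Illustrative stand-ins; none of these are Homa APIs. */
struct dead_item { struct dead_item *next; };
struct sock_state { struct dead_item *dead; };

static void lock(struct sock_state *s)   { (void)s; /* spin_lock_bh */ }
static void unlock(struct sock_state *s) { (void)s; /* spin_unlock_bh */ }

#define BATCH_MAX 20

static void reap_in_batches(struct sock_state *s)
{
	struct dead_item *batch[BATCH_MAX];
	int i, n;

	do {
		/* Gather a bounded batch while holding the lock... */
		n = 0;
		lock(s);
		while (n < BATCH_MAX && s->dead) {
			batch[n++] = s->dead;
			s->dead = s->dead->next;
		}
		unlock(s);

		/* ...then do the expensive freeing outside it. */
		for (i = 0; i < n; i++)
			free(batch[i]);
	} while (n == BATCH_MAX);
}
```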
- */ -struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) -{ - struct homa_rpc *crpc; - struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); - homa_bucket_lock(bucket, client); - hlist_for_each_entry_rcu(crpc, &bucket->rpcs, hash_links) { - if (crpc->id == id) { - return crpc; - } - } - spin_unlock_bh(&bucket->lock); - return NULL; -} - -/** - * homa_find_server_rpc() - Locate server-side information about the RPC that - * a packet belongs to, if there is any. Thread-safe without socket lock. - * @hsk: Socket via which packet was received. - * @saddr: Address from which the packet was sent. - * @sport: Port at @saddr from which the packet was sent. - * @id: Unique identifier for the RPC (must have server bit set). - * - * Return: A pointer to the homa_rpc matching the arguments, or NULL - * if none. The RPC will be locked; the caller must eventually - * unlock it by invoking homa_rpc_unlock. - */ -struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, __u16 sport, __u64 id) -{ - struct homa_rpc *srpc; - struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); - homa_bucket_lock(bucket, server); - hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { - if ((srpc->id == id) && (srpc->dport == sport) && - ipv6_addr_equal(&srpc->peer->addr, saddr)) { - return srpc; - } - } - spin_unlock_bh(&bucket->lock); - return NULL; -} - -/** - * homa_rpc_log() - Log info about a particular RPC; this is functionality - * pulled out of homa_rpc_log_active because its indentation got too deep. - * @rpc: RPC for which key info should be written to the system log. - */ -void homa_rpc_log(struct homa_rpc *rpc) -{ - char *type = homa_is_client(rpc->id) ? "Client" : "Server"; - char *peer = homa_print_ipv6_addr(&rpc->peer->addr); - - if (rpc->state == RPC_INCOMING) - printk(KERN_NOTICE "%s RPC INCOMING, id %llu, peer %s:%d, " - "%d/%d bytes received, incoming %d\n", - type, rpc->id, peer, rpc->dport, - rpc->msgin.total_length - - rpc->msgin.bytes_remaining, - rpc->msgin.total_length, rpc->msgin.incoming); - else if (rpc->state == RPC_OUTGOING) { - printk(KERN_NOTICE "%s RPC OUTGOING, id %llu, peer %s:%d, " - "out length %d, left %d, granted %d, " - "in left %d, resend_ticks %u, silent_ticks %d\n", - type, rpc->id, peer, rpc->dport, - rpc->msgout.length, - rpc->msgout.length - rpc->msgout.next_xmit_offset, - rpc->msgout.granted, - rpc->msgin.bytes_remaining, - rpc->resend_timer_ticks, - rpc->silent_ticks); - } else { - printk(KERN_NOTICE "%s RPC %s, id %llu, peer %s:%d, " - "incoming length %d, outgoing length %d\n", - type, homa_symbol_for_state(rpc), - rpc->id, peer, rpc->dport, - rpc->msgin.total_length, rpc->msgout.length); - } -} - -/** - * homa_rpc_log_active() - Print information to the system log about all - * active RPCs. Intended primarily for debugging. - * @homa: Overall data about the Homa protocol implementation. - * @id: An RPC id: if nonzero, then only RPCs with this id will be - * logged. 
- */ -void homa_rpc_log_active(struct homa *homa, uint64_t id) -{ - struct homa_socktab_scan scan; - struct homa_sock *hsk; - struct homa_rpc *rpc; - int count = 0; - - printk("Logging active Homa RPCs:\n"); - rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { - if (list_empty(&hsk->active_rpcs) || hsk->shutdown) - continue; - - if (!homa_protect_rpcs(hsk)) - continue; - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - count++; - if ((id != 0) && (id != rpc->id)) - continue; - homa_rpc_log(rpc); - } - homa_unprotect_rpcs(hsk); - } - rcu_read_unlock(); - printk("Finished logging active Homa RPCs: %d active RPCs\n", count); -} - -/** - * homa_print_ipv4_addr() - Convert an IPV4 address to the standard string - * representation. - * @addr: Address to convert, in network byte order. - * - * Return: The converted value. Values are stored in static memory, so - * the caller need not free. This also means that storage is - * eventually reused (there are enough buffers to accommodate - * multiple "active" values). - * - * Note: Homa uses this function, rather than the %pI4 format specifier - * for snprintf et al., because the kernel's version of snprintf isn't - * available in Homa's unit test environment. - */ -char *homa_print_ipv4_addr(__be32 addr) -{ -#define NUM_BUFS_IPV4 4 -#define BUF_SIZE_IPV4 30 - static char buffers[NUM_BUFS_IPV4][BUF_SIZE_IPV4]; - static int next_buf = 0; - __u32 a2 = ntohl(addr); - char *buffer = buffers[next_buf]; - next_buf++; - if (next_buf >= NUM_BUFS_IPV4) - next_buf = 0; - snprintf(buffer, BUF_SIZE_IPV4, "%u.%u.%u.%u", (a2 >> 24) & 0xff, - (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); - return buffer; -} - -/** - * homa_print_ipv6_addr() - Convert an IPv6 address to a human-readable string - * representation. IPv4-mapped addresses are printed in IPv4 syntax. - * @addr: Address to convert, in network byte order. - * - * Return: The converted value. Values are stored in static memory, so - * the caller need not free. This also means that storage is - * eventually reused (there are enough buffers to accommodate - * multiple "active" values). - */ -char *homa_print_ipv6_addr(const struct in6_addr *addr) -{ -#define NUM_BUFS (1 << 2) -#define BUF_SIZE 64 - static char buffers[NUM_BUFS][BUF_SIZE]; - static int next_buf = 0; - char *buffer = buffers[next_buf]; - next_buf++; - if (next_buf >= NUM_BUFS) - next_buf = 0; #ifdef __UNIT_TEST__ - struct in6_addr zero = {}; - if (ipv6_addr_equal(addr, &zero)) { - snprintf(buffer, BUF_SIZE, "0.0.0.0"); - } else if ((addr->s6_addr32[0] == 0) && - (addr->s6_addr32[1] == 0) && - (addr->s6_addr32[2] == htonl(0x0000ffff))) { - __u32 a2 = ntohl(addr->s6_addr32[3]); - snprintf(buffer, BUF_SIZE, "%u.%u.%u.%u", (a2 >> 24) & 0xff, - (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); - } else { - const char *inet_ntop(int, const void *, char *, size_t); - inet_ntop(AF_INET6, addr, buffer + 1, BUF_SIZE); - buffer[0] = '['; - strcat(buffer, "]"); - } -#else - snprintf(buffer, BUF_SIZE, "%pI6", addr); -#endif - return buffer; -} - -/** - * homa_print_packet() - Print a human-readable string describing the - * information in a Homa packet. - * @skb: Packet whose information should be printed. - * @buffer: Buffer in which to generate the string. - * @buf_len: Number of bytes available at @buffer. 
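The address printers just above (also leaving this file) rely on a small idiom that recurs in Homa's debugging helpers: a ring of static buffers, rotated on each call, so that several formatted values can be live inside a single printk before storage is recycled. The cost is that the functions are not thread-safe, which the comments accept since they exist for debugging. A minimal userspace sketch of the idiom:

```c
#include <stdio.h>

#define NUM_BUFS 4
#define BUF_SIZE 32

/* Returns a different static buffer on each call, so up to NUM_BUFS
 * results can appear in one printf before a buffer is reused. Not
 * thread-safe, like the Homa helpers it imitates.
 */
static const char *fmt_addr(unsigned int a)
{
	static char buffers[NUM_BUFS][BUF_SIZE];
	static int next_buf;
	char *buffer = buffers[next_buf];

	next_buf = (next_buf + 1) % NUM_BUFS;
	snprintf(buffer, BUF_SIZE, "%u.%u.%u.%u", (a >> 24) & 0xff,
		 (a >> 16) & 0xff, (a >> 8) & 0xff, a & 0xff);
	return buffer;
}

int main(void)
{
	/* Both results remain valid within this one call. */
	printf("%s -> %s\n", fmt_addr(0x7f000001), fmt_addr(0x0a000002));
	return 0;
}
```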
- * - * Return: @buffer - */ -char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) -{ - int used = 0; - struct common_header *common; - struct in6_addr saddr; - - if (skb == NULL) { - snprintf(buffer, buf_len, "skb is NULL!"); - buffer[buf_len-1] = 0; - return buffer; - } +#include "utils.h" + unit_homa_destroy(homa); +#endif /* __UNIT_TEST__ */ - common = (struct common_header *) skb->data; - saddr = skb_canonical_ipv6_saddr(skb); - used = homa_snprintf(buffer, buf_len, used, - "%s from %s:%u, dport %d, id %llu", - homa_symbol_for_type(common->type), - homa_print_ipv6_addr(&saddr), - ntohs(common->sport), ntohs(common->dport), - be64_to_cpu(common->sender_id)); - switch (common->type) { - case DATA: { - struct data_header *h = (struct data_header *) - skb->data; - struct data_segment *seg; - int seg_length = ntohl(h->seg.segment_length); - int bytes_left, i; - used = homa_snprintf(buffer, buf_len, used, - ", message_length %d, offset %d, " - "data_length %d, incoming %d", - ntohl(h->message_length), - ntohl(h->seg.offset), seg_length, - ntohl(h->incoming)); - if (ntohs(h->cutoff_version != 0)) - used = homa_snprintf(buffer, buf_len, used, - ", cutoff_version %d", - ntohs(h->cutoff_version)); - if (h->retransmit) - used = homa_snprintf(buffer, buf_len, used, - ", RETRANSMIT"); - if (skb_shinfo(skb)->gso_type == 0xd) - used = homa_snprintf(buffer, buf_len, used, - ", TSO disabled"); - bytes_left = skb->len - sizeof32(*h) - seg_length; - if (skb_shinfo(skb)->gso_segs <= 1) - break; - used = homa_snprintf(buffer, buf_len, used, ", extra segs"); - for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { - seg = (struct data_segment *) (skb->data + skb->len - - bytes_left); - seg_length = ntohl(seg->segment_length); - used = homa_snprintf(buffer, buf_len, used, - " %d@%d", seg_length, - ntohl(seg->offset)); - bytes_left -= sizeof32(*seg) + seg_length; - }; - break; - } - case GRANT: { - struct grant_header *h = (struct grant_header *) skb->data; - used = homa_snprintf(buffer, buf_len, used, - ", offset %d, grant_prio %u", - ntohl(h->offset), h->priority); - break; + /* The order of the following cleanups matters! */ + if (homa->socktab) { + homa_socktab_destroy(homa->socktab, NULL); + kfree(homa->socktab); + homa->socktab = NULL; } - case RESEND: { - struct resend_header *h = (struct resend_header *) skb->data; - used = homa_snprintf(buffer, buf_len, used, - ", offset %d, length %d, resend_prio %u", - ntohl(h->offset), ntohl(h->length), - h->priority); - break; +#ifndef __STRIP__ /* See strip.py */ + if (homa->grant) { + homa_grant_free(homa->grant); + homa->grant = NULL; } - case UNKNOWN: - /* Nothing to add here. */ - break; - case BUSY: - /* Nothing to add here. */ - break; - case CUTOFFS: { - struct cutoffs_header *h = (struct cutoffs_header *) skb->data; - used = homa_snprintf(buffer, buf_len, used, - ", cutoffs %d %d %d %d %d %d %d %d, version %u", - ntohl(h->unsched_cutoffs[0]), - ntohl(h->unsched_cutoffs[1]), - ntohl(h->unsched_cutoffs[2]), - ntohl(h->unsched_cutoffs[3]), - ntohl(h->unsched_cutoffs[4]), - ntohl(h->unsched_cutoffs[5]), - ntohl(h->unsched_cutoffs[6]), - ntohl(h->unsched_cutoffs[7]), - ntohs(h->cutoff_version)); - break; + if (homa->pacer) { + homa_pacer_free(homa->pacer); + homa->pacer = NULL; } - case FREEZE: - /* Nothing to add here. */ - break; - case NEED_ACK: - /* Nothing to add here. 
 */
-		break;
-	case ACK: {
-		struct ack_header *h = (struct ack_header *) skb->data;
-		int i, count;
-		count = ntohs(h->num_acks);
-		used = homa_snprintf(buffer, buf_len, used, ", acks");
-		for (i = 0; i < count; i++) {
-			used = homa_snprintf(buffer, buf_len, used,
-					" [cp %d, sp %d, id %llu]",
-					ntohs(h->acks[i].client_port),
-					ntohs(h->acks[i].server_port),
-					be64_to_cpu(h->acks[i].client_id));
-		}
-		break;
+	if (homa->qshared) {
+		homa_qdisc_shared_free(homa->qshared);
+		homa->qshared = NULL;
 	}
+#endif /* See strip.py */
+	if (homa->peertab) {
+		homa_peer_free_peertab(homa->peertab);
+		homa->peertab = NULL;
 	}
+#ifndef __STRIP__ /* See strip.py */
 
-	buffer[buf_len-1] = 0;
-	return buffer;
+	homa_skb_cleanup(homa);
+#endif /* See strip.py */
 }
 
 /**
- * homa_print_packet_short() - Print a human-readable string describing the
- * information in a Homa packet. This function generates a shorter
- * description than homa_print_packet.
- * @skb:     Packet whose information should be printed.
- * @buffer:  Buffer in which to generate the string.
- * @buf_len: Number of bytes available at @buffer.
- *
- * Return:	@buffer
+ * homa_net_init() - Initialize a new struct homa_net as a per-net subsystem.
+ * @hnet:    Struct to initialize.
+ * @net:     The network namespace the struct will be associated with.
+ * @homa:    The main Homa data structure to use for the net.
+ * Return: 0 on success, otherwise a negative errno.
  */
-char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len)
+int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa)
 {
-	struct common_header *common =
-			(struct common_header *) skb_transport_header(skb);
-	switch (common->type) {
-	case DATA: {
-		struct data_header *h = (struct data_header *) common;
-		struct data_segment *seg;
-		int bytes_left, used, i;
-		int seg_length = ntohl(h->seg.segment_length);
-
-		used = homa_snprintf(buffer, buf_len, 0, "DATA%s %d@%d",
-				h->retransmit ? " retrans" : "",
-				seg_length, ntohl(h->seg.offset));
-		bytes_left = skb->len - sizeof32(*h) - seg_length;
-		for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) {
-			seg = (struct data_segment *) (skb->data + skb->len
-					- bytes_left);
-			seg_length = ntohl(seg->segment_length);
-			used = homa_snprintf(buffer, buf_len, used,
-					" %d@%d", seg_length,
-					ntohl(seg->offset));
-			bytes_left -= sizeof32(*seg) + seg_length;
-		}
-		break;
-	}
-	case GRANT: {
-		struct grant_header *h = (struct grant_header *) common;
-		snprintf(buffer, buf_len, "GRANT %d@%d", ntohl(h->offset),
-				h->priority);
-		break;
-	}
-	case RESEND: {
-		struct resend_header *h = (struct resend_header *) common;
-		snprintf(buffer, buf_len, "RESEND %d-%d@%d", ntohl(h->offset),
-				ntohl(h->offset) + ntohl(h->length) - 1,
-				h->priority);
-		break;
-	}
-	case UNKNOWN:
-		snprintf(buffer, buf_len, "UNKNOWN");
-		break;
-	case BUSY:
-		snprintf(buffer, buf_len, "BUSY");
-		break;
-	case CUTOFFS:
-		snprintf(buffer, buf_len, "CUTOFFS");
-		break;
-	case FREEZE:
-		snprintf(buffer, buf_len, "FREEZE");
-		break;
-	case NEED_ACK:
-		snprintf(buffer, buf_len, "NEED_ACK");
-		break;
-		break;
-	case ACK:
-		snprintf(buffer, buf_len, "ACK");
-		break;
-	default:
-		snprintf(buffer, buf_len, "unknown packet type 0x%x",
-				common->type);
-		break;
-	}
-	return buffer;
-}
-
-/**
- * homa_snprintf() - This function makes it easy to use a series of calls
- * to snprintf to gradually append information to a fixed-size buffer.
- * If the buffer fills, the function can continue to be called, but nothing
- * more will get added to the buffer.
- * @buffer: Characters accumulate here. - * @size: Total space available in @buffer. - * @used: Number of bytes currently occupied in the buffer, not including - * a terminating null character; this is typically the result of - * the previous call to this function. - * @format: Format string suitable for passing to printf-like functions, - * followed by values for the various substitutions requested - * in @format - * @ ... - * - * Return: The number of characters now occupied in @buffer, not - * including the terminating null character. - */ -int homa_snprintf(char *buffer, int size, int used, const char* format, ...) -{ - int new_chars; - - va_list ap; - va_start(ap, format); - - if (used >= (size-1)) - return used; - - new_chars = vsnprintf(buffer + used, size - used, format, ap); - if (new_chars < 0) - return used; - if (new_chars >= (size - used)) - return size - 1; - return used + new_chars; -} - -/** - * homa_symbol_for_state() - Returns a printable string describing an - * RPC state. - * @rpc: RPC whose state should be returned in printable form. - * - * Return: A static string holding the current state of @rpc. - */ -char *homa_symbol_for_state(struct homa_rpc *rpc) -{ - static char buffer[20]; - switch (rpc->state) { - case RPC_OUTGOING: - return "OUTGOING"; - case RPC_INCOMING: - return "INCOMING"; - case RPC_IN_SERVICE: - return "IN_SERVICE"; - case RPC_DEAD: - return "DEAD"; - } - - /* See safety comment in homa_symbol_for_type. */ - snprintf(buffer, sizeof(buffer)-1, "unknown(%u)", rpc->state); - buffer[sizeof(buffer)-1] = 0; - return buffer; -} - -/** - * homa_symbol_for_type() - Returns a printable string describing a packet type. - * @type: A value from those defined by &homa_packet_type. - * - * Return: A static string holding the packet type corresponding to @type. - */ -char *homa_symbol_for_type(uint8_t type) -{ - static char buffer[20]; - switch (type) { - case DATA: - return "DATA"; - case GRANT: - return "GRANT"; - case RESEND: - return "RESEND"; - case UNKNOWN: - return "UNKNOWN"; - case BUSY: - return "BUSY"; - case CUTOFFS: - return "CUTOFFS"; - case FREEZE: - return "FREEZE"; - case NEED_ACK: - return "NEED_ACK"; - case ACK: - return "ACK"; - } - - /* Using a static buffer can produce garbled text under concurrency, - * but (a) it's unlikely (this code only executes if the opcode is - * bogus), (b) this is mostly for testing and debugging, and (c) the - * code below ensures that the string cannot run past the end of the - * buffer, so the code is safe. */ - snprintf(buffer, sizeof(buffer)-1, "unknown(%u)", type); - buffer[sizeof(buffer)-1] = 0; - return buffer; -} - -/** - * homa_append_metric() - Formats a new metric and appends it to homa->metrics. - * @homa: The new data will appended to the @metrics field of - * this structure. - * @format: Standard printf-style format string describing the - * new metric. Arguments after this provide the usual - * values expected for printf-like functions. - */ -void homa_append_metric(struct homa *homa, const char* format, ...) 
-{ - char *new_buffer; - size_t new_chars; - va_list ap; - - if (!homa->metrics) { -#ifdef __UNIT_TEST__ - homa->metrics_capacity = 30; -#else - homa->metrics_capacity = 4096; -#endif - homa->metrics = kmalloc(homa->metrics_capacity, GFP_KERNEL); - if (!homa->metrics) { - printk(KERN_WARNING "homa_append_metric couldn't " - "allocate memory\n"); - return; - } - homa->metrics_length = 0; - } - - /* May have to execute this loop multiple times if we run out - * of space in homa->metrics; each iteration expands the storage, - * until eventually it is large enough. - */ - while (true) { - va_start(ap, format); - new_chars = vsnprintf(homa->metrics + homa->metrics_length, - homa->metrics_capacity - homa->metrics_length, - format, ap); - va_end(ap); - if ((homa->metrics_length + new_chars) < homa->metrics_capacity) - break; - - /* Not enough room; expand buffer capacity. */ - homa->metrics_capacity *= 2; - new_buffer = kmalloc(homa->metrics_capacity, GFP_KERNEL); - if (!new_buffer) { - printk(KERN_WARNING "homa_append_metric couldn't " - "allocate memory\n"); - return; - } - memcpy(new_buffer, homa->metrics, homa->metrics_length); - kfree(homa->metrics); - homa->metrics = new_buffer; - } - homa->metrics_length += new_chars; + memset(hnet, 0, sizeof(*hnet)); + hnet->homa = homa; + hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1; + return 0; } /** - * homa_print_metrics() - Sample all of the Homa performance metrics and - * generate a human-readable string describing all of them. - * @homa: Overall data about the Homa protocol implementation; - * the formatted string will be stored in homa->metrics. - * - * Return: The formatted string. + * homa_net_destroy() - Release any resources associated with a homa_net. + * @hnet: Object to destroy; must not be used again after this function + * returns. 
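homa_net_init and homa_net_destroy give Homa per-network-namespace state, but this hunk does not show who calls them. One plausible wiring, sketched below on the assumption that a homa_net is allocated per namespace through the id/size mechanism of struct pernet_operations, would use the kernel's standard pernet hooks; global_homa, homa_net_id, and the hook names are assumptions for illustration, not code from this patch.

```c
/* Hypothetical glue (needs <net/net_namespace.h> and <net/netns/generic.h>).
 * Nothing here is shown in this patch; it only illustrates how the new
 * per-net init/destroy pair might be driven.
 */
static unsigned int homa_net_id __read_mostly;

static int __net_init homa_pernet_init(struct net *net)
{
	struct homa_net *hnet = net_generic(net, homa_net_id);

	return homa_net_init(hnet, net, &global_homa);
}

static void __net_exit homa_pernet_exit(struct net *net)
{
	homa_net_destroy(net_generic(net, homa_net_id));
}

static struct pernet_operations homa_net_ops = {
	.init = homa_pernet_init,
	.exit = homa_pernet_exit,
	.id   = &homa_net_id,
	.size = sizeof(struct homa_net),
};

/* From module init: err = register_pernet_subsys(&homa_net_ops); */
```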
*/ -char *homa_print_metrics(struct homa *homa) +void homa_net_destroy(struct homa_net *hnet) { - int core, i, lower = 0; - - homa->metrics_length = 0; - homa_append_metric(homa, - "rdtsc_cycles %20llu " - "RDTSC cycle counter when metrics were gathered\n", - get_cycles()); - homa_append_metric(homa, - "cpu_khz %15llu " - "Clock rate for RDTSC counter, in khz\n", - cpu_khz); - for (core = 0; core < nr_cpu_ids; core++) { - struct homa_metrics *m = &homa_cores[core]->metrics; - homa_append_metric(homa, - "core %15d " - "Core id for following metrics\n", - core); - for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) { - homa_append_metric(homa, - "msg_bytes_%-9d %15llu " - "Bytes in incoming messages containing " - "%d-%d bytes\n", - (i+1)*64, m->small_msg_bytes[i], lower, - (i+1)*64); - lower = (i+1)*64 + 1; - } - for (i = (HOMA_NUM_SMALL_COUNTS*64)/1024; - i < HOMA_NUM_MEDIUM_COUNTS; i++) { - homa_append_metric(homa, - "msg_bytes_%-9d %15llu " - "Bytes in incoming messages containing " - "%d-%d bytes\n", - (i+1)*1024, m->medium_msg_bytes[i], lower, - (i+1)*1024); - lower = (i+1)*1024 + 1; - } - homa_append_metric(homa, - "large_msg_count %15llu " - "# of incoming messages >= %d bytes\n", - m->large_msg_count, lower); - homa_append_metric(homa, - "large_msg_bytes %15llu " - "Bytes in incoming messages >= %d bytes\n", - m->large_msg_bytes, lower); - homa_append_metric(homa, - "sent_msg_bytes %15llu " - "Total bytes in all outgoing messages\n", - m->sent_msg_bytes); - for (i = DATA; i < BOGUS; i++) { - char *symbol = homa_symbol_for_type(i); - homa_append_metric(homa, - "packets_sent_%-7s %15llu " - "%s packets sent\n", - symbol, m->packets_sent[i-DATA], - symbol); - } - for (i = DATA; i < BOGUS; i++) { - char *symbol = homa_symbol_for_type(i); - homa_append_metric(homa, - "packets_rcvd_%-7s %15llu " - "%s packets received\n", - symbol, m->packets_received[i-DATA], - symbol); - } - for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { - homa_append_metric(homa, - "priority%d_bytes %15llu " - "Bytes sent at priority %d " - "(including headers)\n", - i, m->priority_bytes[i], i); - } - for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { - homa_append_metric(homa, - "priority%d_packets %15llu " - "Packets sent at priority %d\n", - i, m->priority_packets[i], i); - } - homa_append_metric(homa, - "requests_received %15llu " - "Incoming request messages\n", - m->requests_received); - homa_append_metric(homa, - "requests_queued %15llu " - "Requests for which no thread was waiting\n", - m->requests_queued); - homa_append_metric(homa, - "responses_received %15llu " - "Incoming response messages\n", - m->responses_received); - homa_append_metric(homa, - "responses_queued %15llu " - "Responses for which no thread was waiting\n", - m->responses_queued); - homa_append_metric(homa, - "fast_wakeups %15llu " - "Messages received while polling\n", - m->fast_wakeups); - homa_append_metric(homa, - "slow_wakeups %15llu " - "Messages received after thread went to sleep\n", - m->slow_wakeups); - homa_append_metric(homa, - "poll_cycles %15llu " - "Time spent polling for incoming messages\n", - m->poll_cycles); - homa_append_metric(homa, - "softirq_calls %15llu " - "Calls to homa_softirq (i.e. 
# GRO pkts " - "received)\n", - m->softirq_calls); - homa_append_metric(homa, - "softirq_cycles %15llu " - "Time spent in homa_softirq during SoftIRQ\n", - m->softirq_cycles); - homa_append_metric(homa, - "bypass_softirq_cycles %15llu " - "Time spent in homa_softirq during bypass " - "from GRO\n", - m->bypass_softirq_cycles); - homa_append_metric(homa, - "linux_softirq_cycles %15llu " - "Time spent in all Linux SoftIRQ\n", - m->linux_softirq_cycles); - homa_append_metric(homa, - "napi_cycles %15llu " - "Time spent in NAPI-level packet handling\n", - m->napi_cycles); - homa_append_metric(homa, - "send_cycles %15llu " - "Time spent in homa_ioc_send kernel call\n", - m->send_cycles); - homa_append_metric(homa, - "send_calls %15llu " - "Total invocations of send kernel call\n", - m->send_calls); - homa_append_metric(homa, - "recv_cycles %15llu " - "Unblocked time spent in recvmsg kernel call\n", - m->recv_cycles - m->blocked_cycles); - homa_append_metric(homa, - "recv_calls %15llu " - "Total invocations of recvmsg kernel call\n", - m->recv_calls); - homa_append_metric(homa, - "blocked_cycles %15llu " - "Time spent blocked in homa_recvmsg\n", - m->blocked_cycles); - homa_append_metric(homa, - "reply_cycles %15llu " - "Time spent in homa_ioc_reply kernel call\n", - m->reply_cycles); - homa_append_metric(homa, - "reply_calls %15llu " - "Total invocations of reply kernel call\n", - m->reply_calls); - homa_append_metric(homa, - "abort_cycles %15llu " - "Time spent in homa_ioc_abort kernel call\n", - m->reply_cycles); - homa_append_metric(homa, - "abort_calls %15llu " - "Total invocations of abort kernel call\n", - m->reply_calls); - homa_append_metric(homa, - "so_set_buf_cycles %15llu " - "Time spent in setsockopt SO_HOMA_SET_BUF\n", - m->so_set_buf_cycles); - homa_append_metric(homa, - "so_set_buf_calls %15llu " - "Total invocations of setsockopt SO_HOMA_SET_BUF\n", - m->so_set_buf_calls); - homa_append_metric(homa, - "grant_cycles %15llu " - "Time spent sending grants\n", - m->grant_cycles); - homa_append_metric(homa, - "timer_cycles %15llu " - "Time spent in homa_timer\n", - m->timer_cycles); - homa_append_metric(homa, - "timer_reap_cycles %15llu " - "Time in homa_timer spent reaping RPCs\n", - m->timer_reap_cycles); - homa_append_metric(homa, - "data_pkt_reap_cycles %15llu " - "Time in homa_data_pkt spent reaping RPCs\n", - m->data_pkt_reap_cycles); - homa_append_metric(homa, - "pacer_cycles %15llu " - "Time spent in homa_pacer_main\n", - m->pacer_cycles); - homa_append_metric(homa, - "homa_cycles %15llu " - "Total time in all Homa-related functions\n", - m->softirq_cycles + m->napi_cycles + - m->send_cycles + m->recv_cycles + - m->reply_cycles - m->blocked_cycles + - m->timer_cycles + m->pacer_cycles); - homa_append_metric(homa, - "pacer_lost_cycles %15llu " - "Lost transmission time because pacer was " - "slow\n", - m->pacer_lost_cycles); - homa_append_metric(homa, - "pacer_bytes %15llu " - "Bytes transmitted when the pacer was active\n", - m->pacer_bytes); - homa_append_metric(homa, - "pacer_skipped_rpcs %15llu " - "Pacer aborts because of locked RPCs\n", - m->pacer_skipped_rpcs); - homa_append_metric(homa, - "pacer_needed_help %15llu " - "homa_pacer_xmit invocations from " - "homa_check_pacer\n", - m->pacer_needed_help); - homa_append_metric(homa, - "throttled_cycles %15llu " - "Time when the throttled queue was nonempty\n", - m->throttled_cycles); - homa_append_metric(homa, - "resent_packets %15llu " - "DATA packets sent in response to RESENDs\n", - m->resent_packets); - 
homa_append_metric(homa, - "peer_hash_links %15llu " - "Hash chain link traversals in peer table\n", - m->peer_hash_links); - homa_append_metric(homa, - "peer_new_entries %15llu " - "New entries created in peer table\n", - m->peer_new_entries); - homa_append_metric(homa, - "peer_kmalloc_errors %15llu " - "kmalloc failures creating peer table " - "entries\n", - m->peer_kmalloc_errors); - homa_append_metric(homa, - "peer_route_errors %15llu " - "Routing failures creating peer table " - "entries\n", - m->peer_route_errors); - homa_append_metric(homa, - "control_xmit_errors %15llu " - "Errors sending control packets\n", - m->control_xmit_errors); - homa_append_metric(homa, - "data_xmit_errors %15llu " - "Errors sending data packets\n", - m->data_xmit_errors); - homa_append_metric(homa, - "unknown_rpcs %15llu " - "Non-grant packets discarded because RPC unknown\n", - m->unknown_rpcs); - homa_append_metric(homa, - "server_cant_create_rpcs %15llu " - "Packets discarded because server " - "couldn't create RPC\n", - m->server_cant_create_rpcs); - homa_append_metric(homa, - "unknown_packet_types %15llu " - "Packets discarded because of unsupported " - "type\n", - m->unknown_packet_types); - homa_append_metric(homa, - "short_packets %15llu " - "Packets discarded because too short\n", - m->short_packets); - homa_append_metric(homa, - "redundant_packets %15llu " - "Packets discarded because data already " - "received\n", - m->redundant_packets); - homa_append_metric(homa, - "resent_packets_used %15llu " - "Retransmitted packets that were actually " - "needed\n", - m->resent_packets_used); - homa_append_metric(homa, - "peer_timeouts %15llu " - "Peers found to be nonresponsive\n", - m->peer_timeouts); - homa_append_metric(homa, - "server_rpc_discards %15llu " - "RPCs aborted by server because of timeouts\n", - m->server_rpc_discards); - homa_append_metric(homa, - "server_rpcs_unknown %15llu " - "RPCs aborted by server because unknown to " - "client\n", - m->server_rpcs_unknown); - homa_append_metric(homa, - "client_lock_misses %15llu " - "Bucket lock misses for client RPCs\n", - m->client_lock_misses); - homa_append_metric(homa, - "client_lock_miss_cycles %15llu " - "Time lost waiting for client bucket locks\n", - m->client_lock_miss_cycles); - homa_append_metric(homa, - "server_lock_misses %15llu " - "Bucket lock misses for server RPCs\n", - m->server_lock_misses); - homa_append_metric(homa, - "server_lock_miss_cycles %15llu " - "Time lost waiting for server bucket locks\n", - m->server_lock_miss_cycles); - homa_append_metric(homa, - "socket_lock_misses %15llu " - "Socket lock misses\n", - m->socket_lock_misses); - homa_append_metric(homa, - "socket_lock_miss_cycles %15llu " - "Time lost waiting for socket locks\n", - m->socket_lock_miss_cycles); - homa_append_metric(homa, - "throttle_lock_misses %15llu " - "Throttle lock misses\n", - m->throttle_lock_misses); - homa_append_metric(homa, - "throttle_lock_miss_cycles %15llu " - "Time lost waiting for throttle locks\n", - m->throttle_lock_miss_cycles); - homa_append_metric(homa, - "grantable_lock_misses %15llu " - "Grantable lock misses\n", - m->grantable_lock_misses); - homa_append_metric(homa, - "grantable_lock_miss_cycles%15llu " - "Time lost waiting for grantable lock\n", - m->grantable_lock_miss_cycles); - homa_append_metric(homa, - "peer_ack_lock_misses %15llu " - "Misses on peer ack locks\n", - m->peer_ack_lock_misses); - homa_append_metric(homa, - "peer_ack_lock_miss_cycles %15llu " - "Time lost waiting for peer ack locks\n", - 
m->peer_ack_lock_miss_cycles); - homa_append_metric(homa, - "disabled_reaps %15llu " - "Reaper invocations that were disabled\n", - m->disabled_reaps); - homa_append_metric(homa, - "disabled_rpc_reaps %15llu " - "Disabled RPCs skipped by reaper\n", - m->disabled_rpc_reaps); - homa_append_metric(homa, - "reaper_calls %15llu " - "Reaper invocations that were not disabled\n", - m->reaper_calls); - homa_append_metric(homa, - "reaper_dead_skbs %15llu " - "Sum of hsk->dead_skbs across all reaper " - "calls\n", - m->reaper_dead_skbs); - homa_append_metric(homa, - "forced_reaps %15llu " - "Reaps forced by accumulation of dead RPCs\n", - m->forced_reaps); - homa_append_metric(homa, - "throttle_list_adds %15llu " - "Calls to homa_add_to_throttled\n", - m->throttle_list_adds); - homa_append_metric(homa, - "throttle_list_checks %15llu " - "List elements checked in " - "homa_add_to_throttled\n", - m->throttle_list_checks); - homa_append_metric(homa, - "fifo_grants %15llu " - "Grants issued using FIFO priority\n", - m->fifo_grants); - homa_append_metric(homa, - "fifo_grants_no_incoming %15llu " - "FIFO grants to messages with no " - "outstanding grants\n", - m->fifo_grants_no_incoming); - homa_append_metric(homa, - "ack_overflows %15llu " - "Explicit ACKs sent because peer->acks was " - "full\n", - m->ack_overflows); - homa_append_metric(homa, - "ignored_need_acks %15llu " - "NEED_ACKs ignored because RPC result not " - "yet received\n", - m->ignored_need_acks); - homa_append_metric(homa, - "bpage_reuses %15llu " - "Buffer page could be reused because ref " - "count was zero\n", - m->bpage_reuses); - for (i = 0; i < NUM_TEMP_METRICS; i++) - homa_append_metric(homa, - "temp%-2d %15llu " - "Temporary use in testing\n", - i, m->temp[i]); - } - - return homa->metrics; + homa_socktab_destroy(hnet->homa->socktab, hnet); + homa_peer_free_net(hnet); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_prios_changed() - This function is called whenever configuration * information related to priorities, such as @homa->unsched_cutoffs or @@ -1609,7 +209,7 @@ void homa_prios_changed(struct homa *homa) */ homa->unsched_cutoffs[0] = INT_MAX; - for (i = HOMA_MAX_PRIORITIES-1; ; i--) { + for (i = HOMA_MAX_PRIORITIES - 1; ; i--) { if (i >= homa->num_priorities) { homa->unsched_cutoffs[i] = 0; continue; @@ -1619,92 +219,24 @@ void homa_prios_changed(struct homa *homa) homa->max_sched_prio = 0; break; } - if ((homa->unsched_cutoffs[i] >= HOMA_MAX_MESSAGE_LENGTH)) { - homa->max_sched_prio = i-1; + if (homa->unsched_cutoffs[i] >= HOMA_MAX_MESSAGE_LENGTH) { + homa->max_sched_prio = i - 1; break; } } homa->cutoff_version++; } +#endif /* See strip.py */ /** * homa_spin() - Delay (without sleeping) for a given time interval. - * @usecs: How long to delay (in microseconds) + * @ns: How long to delay (in nanoseconds) */ -void homa_spin(int usecs) +void homa_spin(int ns) { - __u64 end; - end = get_cycles() + (usecs*cpu_khz)/1000; - while (get_cycles() < end) { - /* Empty loop body.*/ - } -} + u64 end; -/** - * homa_free_skbs() - Free all of the skbs in a list. - * @head: First in a list of socket buffers linked through homa_next_skb. - */ -void homa_free_skbs(struct sk_buff *head) -{ - while (head) { - struct sk_buff *next = homa_get_skb_info(head)->next_skb; - kfree_skb(head); - head = next; - } -} - -/** - * homa_grantable_lock_slow() - This function implements the slow path for - * acquiring the grantable lock. It is invoked when the lock isn't immediately - * available. 
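The *_lock_slow helpers removed in this region (homa_rpc_lock_slow earlier, homa_grantable_lock_slow and homa_throttle_lock_slow below) all follow one instrumentation pattern: the caller's fast path is an uncontended trylock, and only the contended case enters a slow function that acquires the lock while charging the wait to miss-count and miss-time metrics. A self-contained C11 sketch of the pattern, assuming nothing from Homa (the clock stand-in and all names are illustrative):

```c
#include <stdatomic.h>
#include <stdint.h>
#include <time.h>

struct metered_lock {
	atomic_flag flag;		/* initialize with ATOMIC_FLAG_INIT */
	atomic_uint_fast64_t misses;
	atomic_uint_fast64_t miss_cycles;
};

static uint64_t clock_cycles(void)
{
	return (uint64_t)clock();	/* stand-in for homa_clock() */
}

/* Slow path: spin until acquired, then record how long we waited. */
static void metered_lock_slow(struct metered_lock *l)
{
	uint64_t start = clock_cycles();

	while (atomic_flag_test_and_set_explicit(&l->flag,
						 memory_order_acquire))
		;
	atomic_fetch_add(&l->misses, 1);
	atomic_fetch_add(&l->miss_cycles, clock_cycles() - start);
}

/* Fast path: an uncontended acquire skips all bookkeeping. */
static void metered_lock(struct metered_lock *l)
{
	if (atomic_flag_test_and_set_explicit(&l->flag,
					      memory_order_acquire))
		metered_lock_slow(l);
}

static void metered_unlock(struct metered_lock *l)
{
	atomic_flag_clear_explicit(&l->flag, memory_order_release);
}
```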
It waits for the lock, but also records statistics about - * the waiting time. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_grantable_lock_slow(struct homa *homa) -{ - __u64 start = get_cycles(); - tt_record("beginning wait for grantable lock"); - spin_lock_bh(&homa->grantable_lock); - tt_record("ending wait for grantable lock"); - INC_METRIC(grantable_lock_misses, 1); - INC_METRIC(grantable_lock_miss_cycles, get_cycles() - start); -} - -/** - * homa_throttle_lock_slow() - This function implements the slow path for - * acquiring the throttle lock. It is invoked when the lock isn't immediately - * available. It waits for the lock, but also records statistics about - * the waiting time. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_throttle_lock_slow(struct homa *homa) -{ - __u64 start = get_cycles(); - tt_record("beginning wait for throttle lock"); - spin_lock_bh(&homa->throttle_lock); - tt_record("ending wait for throttle lock"); - INC_METRIC(throttle_lock_misses, 1); - INC_METRIC(throttle_lock_miss_cycles, get_cycles() - start); -} - -/** - * homa_freeze() - Freezes the timetrace if a particular kind of freeze - * has been requested through sysctl. - * @rpc: If we freeze our timetrace, we'll also send a freeze request - * to the peer for this RPC. - * @type: Condition that just occurred. If this doesn't match the - * externally set "freeze_type" value, then we don't freeze. - * @format: Format string used to generate a time trace record describing - * the reason for the freeze; must include "id %d, peer 0x%x" - */ -void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) -{ - if (type != rpc->hsk->homa->freeze_type) - return; - if (!tt_frozen) { - struct freeze_header freeze; - tt_record2(format, rpc->id, tt_addr(rpc->peer->addr)); - tt_freeze(); - homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); - } + end = homa_clock() + homa_ns_to_cycles(ns); + while (homa_clock() < end) + cpu_relax(); } diff --git a/homa_wire.h b/homa_wire.h new file mode 100644 index 00000000..948be506 --- /dev/null +++ b/homa_wire.h @@ -0,0 +1,577 @@ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ + +/* This file defines the on-the-wire format of Homa packets. */ + +#ifndef _HOMA_WIRE_H +#define _HOMA_WIRE_H + +#include <linux/skbuff.h> +#ifdef __UNIT_TEST__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif /* __UNIT_TEST__ */ +#include <net/tcp.h> +#ifdef __UNIT_TEST__ +#pragma GCC diagnostic pop +#endif /* __UNIT_TEST__ */ + +/* Defines the possible types of Homa packets. + * + * See the xxx_header structs below for more information about each type. + */ +enum homa_packet_type { + DATA = 0x10, +#ifndef __STRIP__ /* See strip.py */ + GRANT = 0x11, +#endif /* See strip.py */ + RESEND = 0x12, + RPC_UNKNOWN = 0x13, + BUSY = 0x14, +#ifndef __STRIP__ /* See strip.py */ + CUTOFFS = 0x15, +#endif /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ + FREEZE = 0x16, +#endif /* See strip.py */ + NEED_ACK = 0x17, + ACK = 0x18, + MAX_OP = 0x18, + /* If you add a new type here, you must also do the following: + * 1. Change MAX_OP so it is the highest valid opcode + * 2. Add support for the new opcode in homa_print_packet, + * homa_print_packet_short, homa_symbol_for_type, and mock_skb_alloc. + * 3. Add the header length to header_lengths in homa_plumbing.c.
+ */ +}; + +/** + * define HOMA_SKB_EXTRA - How many bytes of additional space to allow at the + * beginning of each sk_buff, before the Homa header. This includes room for + * either an IPV4 or IPV6 header, Ethernet header, VLAN header, etc. This is + * a bit of an overestimate, since it also includes space for a TCP header. + */ +#define HOMA_SKB_EXTRA MAX_TCP_HEADER + +/** + * define HOMA_ETH_FRAME_OVERHEAD - Additional overhead bytes for each + * Ethernet packet that are not included in the packet header (preamble, + * start frame delimiter, CRC, and inter-packet gap). + */ +#define HOMA_ETH_FRAME_OVERHEAD 24 + +/** + * define HOMA_ETH_OVERHEAD - Number of bytes per Ethernet packet for Ethernet + * header, CRC, preamble, and inter-packet gap. + */ +#define HOMA_ETH_OVERHEAD (18 + HOMA_ETH_FRAME_OVERHEAD) + +/** + * define HOMA_MIN_PKT_LENGTH - Every Homa packet must be padded to at least + * this length to meet Ethernet frame size limitations. This number includes + * Homa headers and data, but not IP or Ethernet headers. + */ +#define HOMA_MIN_PKT_LENGTH 26 + +/** + * define HOMA_MAX_HEADER - Number of bytes in the largest Homa header. + */ +#define HOMA_MAX_HEADER 90 + +#ifndef __STRIP__ /* See strip.py */ +/** + * define HOMA_MAX_PRIORITIES - The maximum number of priority levels that + * Homa can use (the actual number can be restricted to less than this at + * runtime). Changing this value will affect packet formats. + */ +#define HOMA_MAX_PRIORITIES 8 +#endif /* See strip.py */ + +/** + * struct homa_common_hdr - Wire format for the first bytes in every Homa + * packet. This must (mostly) match the format of a TCP header to enable + * Homa packets to actually be transmitted as TCP packets (and thereby + * take advantage of TSO and other features). + */ +struct homa_common_hdr { + /** + * @sport: Port on source machine from which packet was sent. + * Must be in the same position as in a TCP header. + */ + __be16 sport; + + /** + * @dport: Port on destination that is to receive packet. Must be + * in the same position as in a TCP header. + */ + __be16 dport; + + /** + * @sequence: corresponds to the sequence number field in TCP headers; + * used in DATA packets to hold the offset in the message of the first + * byte of data. However, when TSO is used without TCP hijacking, this + * value will only be correct in the first segment of a GSO packet. + */ + __be32 sequence; + + /** + * @ack: Corresponds to the high-order bits of the acknowledgment + * field in TCP headers; not used by Homa. + */ + char ack[3]; + + /** + * @type: Homa packet type (one of the values of the homa_packet_type + * enum). Corresponds to the low-order byte of the ack in TCP. + */ + u8 type; + + /** + * @doff: High order 4 bits correspond to the Data Offset field of a + * TCP header. In DATA packets they hold the number of 4-byte chunks + * in a homa_data_hdr; used by TSO to determine where the replicated + * header portion ends. For other packets the offset is always 5 + * (standard TCP header length); other values may cause some NICs + * (such as Intel E810-C) to drop outgoing packets when TCP hijacking + * is enabled. The low-order bits are always 0. + */ + u8 doff; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @flags: Holds TCP flags such as URG, ACK, etc. The special value + * HOMA_TCP_FLAGS is stored here to distinguish Homa-over-TCP packets + * from real TCP packets. 
It includes the SYN and RST flags, + * which TCP would never use together; must not include URG or FIN + * (TSO will turn off FIN for all but the last segment). + */ + u8 flags; +#define HOMA_TCP_FLAGS 6 +#else /* See strip.py */ + /** @reserved1: Not used (corresponds to TCP flags). */ + u8 reserved1; +#endif /* See strip.py */ + + /** + * @window: Corresponds to the window field in TCP headers. Not used + * by HOMA. + */ + __be16 window; + + /** + * @checksum: Not used by Homa, but must occupy the same bytes as + * the checksum in a TCP header (TSO may modify this?). + */ + __be16 checksum; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @urgent: occupies the same bytes as the urgent pointer in a TCP + * header. When Homa packets are transmitted over TCP, this has the + * special value HOMA_TCP_URGENT (which is set even though URG is + * not set) to indicate that the packet is actually a Homa packet. + */ + __be16 urgent; +#define HOMA_TCP_URGENT 0xb97d +#else /* See strip.py */ + /** @reserved2: Not used (corresponds to TCP urgent field). */ + __be16 reserved2; +#endif /* See strip.py */ + + /** + * @sender_id: the identifier of this RPC as used on the sender (i.e., + * if the low-order bit is set, then the sender is the server for + * this RPC). + */ + __be64 sender_id; +} __packed; + +/** + * struct homa_ack - Identifies an RPC that can be safely deleted by its + * server. After sending the response for an RPC, the server must retain its + * state for the RPC until it knows that the client has successfully + * received the entire response. An ack indicates this. Clients will + * piggyback acks on future data packets, but if a client doesn't send + * any data to the server, the server will eventually request an ack + * explicitly with a NEED_ACK packet, in which case the client will + * return an explicit ACK. + */ +struct homa_ack { + /** + * @client_id: The client's identifier for the RPC. 0 means this ack + * is invalid. + */ + __be64 client_id; + + /** @server_port: The server-side port for the RPC. */ + __be16 server_port; +} __packed; + +#ifndef __STRIP__ /* See strip.py */ +/* struct homa_data_hdr - Contains data for part or all of a Homa message. + * An incoming packet consists of a homa_data_hdr followed by message data. + * An outgoing packet can have this simple format as well, or it can be + * structured as a GSO packet. Homa supports two different formats for GSO + * packets, depending on whether TCP hijacking is enabled: + * + * No hijacking: TCP hijacking: + * + * |-----------------------| |-----------------------| + * | | | | + * | homa_data_hdr | | homa_data_hdr | + * | | | | + * |---------------------- | |-----------------------| + * | | | | + * | | | | + * | segment data | | segment data | + * | | | | + * | | | | + * |-----------------------| |-----------------------| + * | homa_seg_hdr | | | + * |-----------------------| | | + * | | | segment data | + * | | | | + * | segment data | | | + * | | |-----------------------| + * | | | | + * |-----------------------| | | + * | homa_seg_hdr | | segment data | + * |-----------------------| | | + * | | | | + * | | |-----------------------| + * | segment data | + * | | + * | | + * |-----------------------| + * + * With TCP hijacking, TSO will automatically adjust @common.sequence in + * the segments, so that value can be used as the offset of the data within + * the message. 
Without TCP hijacking, TSO will not adjust @common.sequence + * in the segments, so Homa sprinkles correct offsets (in homa_seg_hdrs) + * throughout the segment data; TSO/GSO will include a different homa_seg_hdr + * in each generated packet. + */ +#else /* See strip.py */ +/* struct homa_data_hdr - Contains data for part or all of a Homa message. + * An incoming packet consists of a homa_data_hdr followed by message data. + * An outgoing packet can have this simple format as well, or it can be + * structured as a GSO packet with the following format: + * + * |-----------------------| + * | | + * | data_header | + * | | + * |---------------------- | + * | | + * | | + * | segment data | + * | | + * | | + * |-----------------------| + * | seg_header | + * |-----------------------| + * | | + * | | + * | segment data | + * | | + * | | + * |-----------------------| + * | seg_header | + * |-----------------------| + * | | + * | | + * | segment data | + * | | + * | | + * |-----------------------| + * + * TSO will not adjust @homa_common_hdr.sequence in the segments, so Homa + * sprinkles correct offsets (in homa_seg_hdrs) throughout the segment data; + * TSO/GSO will include a different homa_seg_hdr in each generated packet. + */ +#endif /* See strip.py */ + +struct homa_seg_hdr { +#ifndef __STRIP__ /* See strip.py */ + /** + * @offset: Offset within message of the first byte of data in + * this segment. If this field is -1 it means that the packet was + * generated by GSO with TCP hijacking. In this case the true offset + * is in @common.sequence. homa_gro_receive detects this situation + * and updates this value from @common.sequence if needed, so the + * value will always be valid once the packet reaches homa_softirq. + */ +#else /* See strip.py */ + /** + * @offset: Offset within message of the first byte of data in + * this segment. + */ +#endif /* See strip.py */ + __be32 offset; +} __packed; + +struct homa_data_hdr { + struct homa_common_hdr common; + + /** @message_length: Total #bytes in the message. */ + __be32 message_length; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @incoming: The receiver can expect the sender to send all of the + * bytes in the message up to at least this offset (exclusive), + * even without additional grants. This includes unscheduled + * bytes, granted bytes, plus any additional bytes the sender + * transmits unilaterally (e.g., to round up to a full GSO batch). + */ + __be32 incoming; +#else /* See strip.py */ + __be32 reserved1; +#endif /* See strip.py */ + + /** @ack: If the @client_id field of this is nonzero, provides info + * about an RPC that the recipient can now safely free. Note: in + * TSO packets this will get duplicated in each of the segments; + * in order to avoid repeated attempts to ack the same RPC, + * homa_gro_receive will clear this field in all segments but the + * first. + */ + struct homa_ack ack; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @cutoff_version: The cutoff_version from the most recent + * CUTOFFS packet that the source of this packet has received + * from the destination of this packet, or 0 if the source hasn't + * yet received a CUTOFFS packet. + */ + __be16 cutoff_version; +#else /* See strip.py */ + __be16 reserved2; +#endif /* See strip.py */ + + /** + * @retransmit: 1 means this packet was sent in response to a RESEND + * (it has already been sent previously). + */ + u8 retransmit; + + char pad[3]; + + /** @seg: First of possibly many segments. 
*/ + struct homa_seg_hdr seg; +} __packed; + +/** + * homa_data_len() - Returns the total number of bytes in a DATA packet + * after the homa_data_hdr. Note: if the packet is a GSO packet, the result + * may include metadata as well as packet data. + * @skb: Incoming data packet + * Return: see above + */ +static inline int homa_data_len(struct sk_buff *skb) +{ + return skb->len - skb_transport_offset(skb) - + sizeof(struct homa_data_hdr); +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * struct homa_grant_hdr - Wire format for GRANT packets, which are sent by + * the receiver back to the sender to indicate that the sender may transmit + * additional bytes in the message. + */ +struct homa_grant_hdr { + /** @common: Fields common to all packet types. */ + struct homa_common_hdr common; + + /** + * @offset: Byte offset within the message. + * + * The sender should now transmit all data up to (but not including) + * this offset ASAP, if it hasn't already. + */ + __be32 offset; + + /** + * @priority: The sender should use this priority level for all future + * MESSAGE_FRAG packets for this message, until a GRANT is received + * with higher offset. Larger numbers indicate higher priorities. + */ + u8 priority; +} __packed; +#endif /* See strip.py */ + +/** + * struct homa_resend_hdr - Wire format for RESEND packets. + * + * A RESEND is sent by the receiver when it believes that message data may + * have been lost in transmission (or if it is concerned that the sender may + * have crashed). The receiver should resend the specified portion of the + * message, even if it already sent it previously. + */ +struct homa_resend_hdr { + /** @common: Fields common to all packet types. */ + struct homa_common_hdr common; + + /** + * @offset: Offset within the message of the first byte of data that + * should be retransmitted. + */ + __be32 offset; + + /** + * @length: Number of bytes of data to retransmit. -1 means no data + * has been received for the message, so everything sent previously + * should be retransmitted. + */ + __be32 length; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @priority: Packet priority to use. + * + * The sender should transmit all the requested data using this + * priority. + */ + u8 priority; +#endif /* See strip.py */ +} __packed; + +/** + * struct homa_rpc_unknown_hdr - Wire format for RPC_UNKNOWN packets. + * + * An RPC_UNKNOWN packet is sent by either server or client when it receives a + * packet for an RPC that is unknown to it. When a client receives an + * RPC_UNKNOWN packet it will typically restart the RPC from the beginning; + * when a server receives an RPC_UNKNOWN packet it will typically discard its + * state for the RPC. + */ +struct homa_rpc_unknown_hdr { + /** @common: Fields common to all packet types. */ + struct homa_common_hdr common; +} __packed; + +/** + * struct homa_busy_hdr - Wire format for BUSY packets. + * + * These packets tell the recipient that the sender is still alive (even if + * it isn't sending data expected by the recipient). + */ +struct homa_busy_hdr { + /** @common: Fields common to all packet types. */ + struct homa_common_hdr common; +} __packed; + +#ifndef __STRIP__ /* See strip.py */ +/** + * struct homa_cutoffs_hdr - Wire format for CUTOFFS packets. + * + * These packets tell the recipient how to assign priorities to + * unscheduled packets. + */ +struct homa_cutoffs_hdr { + /** @common: Fields common to all packet types. 
*/ + struct homa_common_hdr common; + + /** + * @unsched_cutoffs: priorities to use for unscheduled packets + * sent to the sender of this packet. See documentation for + * @homa.unsched_cutoffs for the meanings of these values. + */ + __be32 unsched_cutoffs[HOMA_MAX_PRIORITIES]; + + /** + * @cutoff_version: unique identifier associated with @unsched_cutoffs. + * Must be included in future DATA packets sent to the sender of + * this packet. + */ + __be16 cutoff_version; +} __packed; +#endif /* See strip.py */ + +#ifndef __UPSTREAM__ /* See strip.py */ +/** + * struct homa_freeze_hdr - Wire format for FREEZE packets. + * + * These packets tell the recipient to freeze its timetrace; used + * for debugging. + */ +struct homa_freeze_hdr { + /** @common: Fields common to all packet types. */ + struct homa_common_hdr common; +} __packed; +#endif /* See strip.py */ + +/** + * struct homa_need_ack_hdr - Wire format for NEED_ACK packets. + * + * These packets ask the recipient (a client) to return an ACK message if + * the packet's RPC is no longer active. + */ +struct homa_need_ack_hdr { + /** @common: Fields common to all packet types. */ + struct homa_common_hdr common; +} __packed; + +/** + * struct homa_ack_hdr - Wire format for ACK packets. + * + * These packets are sent from a client to a server to indicate that + * a set of RPCs is no longer active on the client, so the server can + * free any state it may have for them. + */ +struct homa_ack_hdr { + /** @common: Fields common to all packet types. */ + struct homa_common_hdr common; + + /** @num_acks: Number of (leading) elements in @acks that are valid. */ + __be16 num_acks; + +#define HOMA_MAX_ACKS_PER_PKT 5 + /** @acks: Info about RPCs that are no longer active. */ + struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; +} __packed; + +/** + * homa_local_id(): given an RPC identifier from an input packet (which + * is network-encoded), return the decoded id we should use for that + * RPC on this machine. + * @sender_id: RPC id from an incoming packet, such as h->common.sender_id + * Return: see above + */ +static inline u64 homa_local_id(__be64 sender_id) +{ + /* If the client bit was set on the sender side, it needs to be + * removed here, and conversely. + */ + return be64_to_cpu(sender_id) ^ 1; +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_set_hijack() - Set fields in a Homa header that are needed for + * TCP hijacking to work properly. + * @common: Header in which to set fields. + */ +static inline void homa_set_hijack(struct homa_common_hdr *common) +{ + common->flags = HOMA_TCP_FLAGS; + common->urgent = htons(HOMA_TCP_URGENT); + common->doff = 0x50; +} + +/** + * homa_get_offset() - Returns the offset within message of the first byte + * of data in a Homa DATA packet (the offset is stored in different places + * in different situations). + * @h: Header for DATA packet + * Return: See above + */ +static inline int homa_get_offset(struct homa_data_hdr *h) +{ + return (h->seg.offset != -1) ? ntohl(h->seg.offset) : + ntohl(h->common.sequence); +} +#endif /* See strip.py */ + +#endif /* _HOMA_WIRE_H */ diff --git a/man/Makefile b/man/Makefile index aa829ec2..c54a7281 100644 --- a/man/Makefile +++ b/man/Makefile @@ -1,9 +1,6 @@ # Makefile to build man pages for Homa. 
SRCS := homa.7 \ - homa_abort.3 \ - homa_reply.3 \ - homa_send.3 \ recvmsg.2 \ sendmsg.2 @@ -15,14 +12,24 @@ all: $(PDFS) clean: rm -f *.pdf +# Note: in the rules below, it doesn't seem to work to eliminate the +# temporary file and use ps2pdf in a pipeline; as of 12/2024, under +# Cygwin, this produces blank output for some man pages under some +# conditions. %.pdf: %.2 - pdfroff -man $< > $@ + groff -man -Tps $< > tmp.ps + ps2pdf tmp.ps $@ + rm tmp.ps %.pdf: %.3 - pdfroff -man $< > $@ + groff -man -Tps $< > tmp.ps + ps2pdf tmp.ps $@ + rm tmp.ps %.pdf: %.7 - pdfroff -man $< > $@ + groff -man -Tps $< > tmp.ps + ps2pdf tmp.ps $@ + rm tmp.ps # The following target is useful for debugging Makefiles; it # prints the value of a make variable. diff --git a/man/homa.7 b/man/homa.7 index a2b285fe..f1a3c317 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -1,4 +1,4 @@ -.TH HOMA 7 2022-12-13 "Homa" "Linux Programmer's Manual" +.TH HOMA 7 2024-12-4 "Homa" "Linux Programmer's Manual" .SH NAME homa \- Homa transport protocol .SH SYNOPSIS @@ -60,7 +60,7 @@ A request fails only if Homa cannot maintain communication with the Homa transport module on the server. Homa ensures at-most-once semantics for an RPC. .PP -Home is intended for use between machines that are physically +Homa is intended for use between machines that are physically close, with round-trip latencies no more than a few tens of microseconds. Homa is not suitable for wide-area communication. .PP @@ -147,7 +147,7 @@ is complete. Buffering must be set up by invoking .B setsockopt with the -.BR SO_HOMA_SET_BUF +.BR SO_HOMA_RCVBUF option. This call should be made exactly once per socket, before the first call to .BR recvmsg . @@ -167,8 +167,8 @@ arguments must refer to a struct of the following type: .ps -1 .vs -2 .EX -struct homa_set_buf_args { - void *start; +struct homa_rcvbuf_args { + __u64 start; size_t length; }; .EE @@ -192,6 +192,10 @@ then .I recvmsg calls on the socket will return ENOMEM errors. +.PP +Because of this mechanism, a Homa socket cannot be shared by multiple +processes unless the processes also share the buffer space and map +it to the same virtual address in each sharing process. .SH SENDING MESSAGES .PP The @@ -200,19 +204,6 @@ system call can be used to send request and response messages; see Homa's .BR sendmsg (2) man page for details. -In addition, Homa provides library functions -.BR homa_send , -.BR homa_sendv , -.BR homa_reply , -and -.BR homa_replyv , -which are layered on top of -.BR sendmsg . -See the man pages -.BR homa_send (3) -and -.BR homa_reply (3) -for details on these functions. .SH RECEIVING MESSAGES .PP The @@ -220,12 +211,269 @@ The system call is used to receive messages; see Homa's .BR recvmsg (2) man page for details. -.SH ABORTING REQUESTS .PP -It is possible to abort RPCs that are in progress. This is done with -the -.B homa_abort -function call, which is described in a separate manual page. +By default, if +.B bind +has not been invoked for a socket then it can be used only as the client +for outgoing RPCs: incoming requests directed at the socket will be +rejected. Once +.B bind +has been invoked, the socket can act as the server side for incoming +RPCs. In addition, +.B setsockopt +may be invoked with the +.B SO_HOMA_SERVER +option to activate or deactivate any socket for incoming requests. +.B SO_HOMA_SERVER +takes an integer argument, where any nonzero value enables incoming +requests and zero disables them. +The current setting can be retrieved with +.BR getsockopt . 
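To make the socket-setup requirements above concrete, here is a minimal sketch in C: it creates a Homa socket, registers a receive buffer region with SO_HOMA_RCVBUF, then binds a port and enables incoming requests with SO_HOMA_SERVER. The homa.h include path, the IPPROTO_HOMA option level, and the 64 MB region size are illustrative assumptions rather than details taken from this man page.

#include <netinet/in.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <homa.h>                      /* assumed location of Homa defs */

int setup_homa_server(int port)
{
	struct homa_rcvbuf_args rcvbuf;
	struct sockaddr_in6 addr = {0};
	size_t region_len = 64 * 1024 * 1024;    /* illustrative size */
	void *region;
	int one = 1;
	int fd;

	fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_HOMA);
	if (fd < 0)
		return -1;

	/* The receive buffer region must be registered exactly once,
	 * before the first recvmsg on this socket.
	 */
	region = mmap(NULL, region_len, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED)
		return -1;
	rcvbuf.start = (uint64_t)(uintptr_t)region;
	rcvbuf.length = region_len;
	if (setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &rcvbuf,
		       sizeof(rcvbuf)) < 0)
		return -1;

	/* bind enables server mode; SO_HOMA_SERVER can also toggle it
	 * explicitly (nonzero enables incoming requests, zero disables).
	 */
	addr.sin6_family = AF_INET6;
	addr.sin6_port = htons(port);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return -1;
	if (setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SERVER, &one,
		       sizeof(one)) < 0)
		return -1;
	return fd;
}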
+.SH ABORTING RPCS +.PP +It is possible for a client to abort RPCs that are in progress by invoking +.B ioctl +with the +.B HOMAIOCABORT +operation. One additional argument must be specified for +.BR ioctl , +consisting of a pointer to the following structure: +.in +4n +.ps -1 +.vs -2 +.EX +struct homa_abort_args { + __u64 id; /* Id of RPC to abort or 0. */ + __u32 error; /* Errno to use for completion or 0. */ + __u32 _pad1; /* Must be zero. */ + __u64 _pad2[2]; /* Must be zero. */ +}; +.EE +.vs +2 +.ps +1 +.in +.PP +The +.B id +field contains the identifier for an RPC; if this RPC is active on the socket +then it is aborted. If no such RPC exists then the +.B ioctl +returns without doing anything. If +.B id +is zero then all outgoing RPCs for the socket are aborted. +.PP +If +.B error +is 0 then the matching RPCs will be deleted and all state associated +with them will be freed (the RPCs will not be returned by +.BR recvmsg ). +If +.B error +is nonzero then the RPC(s) will immediately be placed in the completed +state so that they can be returned by +.BR recvmsg ; +.B recvmsg +will return an error for each aborted RPC, with an +.B errno +value of +.BR error . +Regardless of whether the RPC(s) are completed or freed, the +servers for the RPCs +are not notified of the abort. If a +request has already been transmitted to the server at the time +an abort is requested, it may still be executed on the server. Any response +from the server will be discarded. +.PP +Only outgoing (client-side) RPCs may be aborted. +.SH STATUS INFORMATION +.PP +To retrieve information about the state of a Homa socket, invoke +.B ioctl +with the +.B HOMAIOCINFO +operation. One additional argument must be specified for +.BR ioctl , +consisting of a pointer to the following structure: +.in +4n +.ps -1 +.vs -2 +.EX +struct homa_info { + struct homa_rpc_info *rpc_info; + size_t rpc_info_length; + __u64 bpool_avail_bytes; + __u32 port; + __u32 num_rpcs; + char error_msg[HOMA_ERROR_MSG_SIZE]; +}; +.EE +.vs +2 +.ps +1 +.in +.PP +The caller must set the +.BR rpc_info +and +.B +rpc_info_length +fields before invoking +.BR ioctl . +The +.B rpc_info +field points to an area in application memory that can be used to +return detailed information about all of the active RPCs on the socket; +.B rpc_info_length +indicates how much memory is available at +.BR rpc_info , +in bytes. If +.B rpc_info +is NULL or +.B rpc_info_length +is zero then no detailed RPC information will be returned. +.PP +The other fields are used to return information about the Homa socket +to the application: +.TP 18n +.BR bpool_avail_bytes +The amount of memory currently available in the receive +buffer region for the socket (previously specified with the +.B SO_HOMA_RCVBUF +socket option). +.TP 18n +.BR port +The socket's port number. +.TP 18n +.BR num_rpcs +The number of active RPCs on the socket. +.TP 18n +.BR error_msg +A null-terminated string containing additional information about the +most recent error returned by Homa for a system call on this socket. This +is particularly useful for errors such as +.B EINVAL +and +.BR EFAULT , +where there are numerous possible causes. +.PP +If +.B num_rpcs +is greater than zero and +.B rpc_info +has been specified then details about active RPCs will be returned in +.B rpc_info +(if +.B rpc_info_length +is too small to hold all of the active RPCs, then some RPCs will +not be recorded).
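Before the per-RPC reporting format that follows, a short sketch of the abort interface just described may help. It aborts every outgoing RPC on a socket and completes each one, so that recvmsg reports ECANCELED for it; the homa.h include is an assumed location for HOMAIOCABORT and struct homa_abort_args.

#include <errno.h>
#include <string.h>
#include <sys/ioctl.h>
#include <homa.h>                      /* assumed location of Homa defs */

int abort_all_client_rpcs(int fd)
{
	struct homa_abort_args args;

	memset(&args, 0, sizeof(args));  /* _pad1 and _pad2 must be zero */
	args.id = 0;                     /* zero means all outgoing RPCs */
	args.error = ECANCELED;          /* nonzero: complete the RPCs so
					  * recvmsg returns this errno,
					  * rather than freeing silently */
	return ioctl(fd, HOMAIOCABORT, &args);
}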
Each element of +.B rpc_info +contains information about one RPC, in the following format: +.in +4n +.ps -1 +.vs -2 +.EX +struct homa_rpc_info { + __u64 id; + union { + struct sockaddr_storage storage; + struct sockaddr_in in4; + struct sockaddr_in6 in6; + } peer; + __u64 completion_cookie; + __s32 tx_length; + __u32 tx_sent; + __u32 tx_granted; + __u32 tx_prio; + __s32 rx_length; + __u32 rx_remaining; + __u32 rx_gaps; + __u32 rx_gap_bytes; + __u32 rx_granted; + __u16 flags; +}; +.EE +.vs +2 +.ps +1 +.in +.PP +The fields have the following meaning: +.TP 18n +.BR id +Identifier for the RPC. If the low-order bit is 1, this node is the server +for the RPC; 0 means this node is the client. +.TP 18n +.BR peer +Address of the peer application for the RPC, including both its host address +and port number. +.TP 18n +.BR completion_cookie +For client RPCs, this is the completion cookie specified when +.B sendmsg +was invoked to create the RPC. For server RPCs this field is always zero. +.TP 18n +.BR tx_length +The length of the outgoing message for the RPC (in bytes) or -1 if this +is a server RPC and +.B sendmsg +has not yet been invoked for the RPC's response. +.TP 18n +.B tx_sent +The number of bytes in the outgoing message that have been transmitted +at least once (this is also the index within the message of the first +byte that has not yet been transmitted). +.TP 18n +.BR tx_granted +The number of bytes in the outgoing message that this node is authorized +to transmit (includes any unscheduled bytes). +.TP 18n +.BR tx_prio +The priority level currently being used to transmit data packets. +.TP 18n +.BR rx_length +The length of the incoming message for the RPC (in bytes) or -1 if this +is a client RPC and no packets have been received for the response yet. +.TP 18n +.BR rx_remaining +The number of bytes in the incoming message that have not yet been received. +.TP 18n +.BR rx_gaps +The number of gaps in the incoming message. A gap is a range of bytes that +have not been received while the byte just after the end of the range has +been received. +.TP 18n +.BR rx_gap_bytes +The total number of bytes in all of the gaps of the incoming message. +.TP 18n +.BR rx_granted +The number of bytes in the incoming message that this node has authorized +the peer to transmit (this is also the index within the message of the +first byte that has not been granted). +.TP 18n +.BR flags +A bit mask containing various flags; see below. +.PP +The supported bits in +.B flags +are as follows: +.TP 25n +.BR HOMA_RPC_PRIVATE +This is a client RPC and was declared private when +.B sendmsg +was invoked. +.TP 25n +.BR HOMA_RPC_BUF_STALL +The incoming message has stalled because there is no buffer space +available for it. Transmission of the message will restart when buffer space +becomes available. +.TP 25n +.BR HOMA_RPC_RX_COPY +There exist packets that have been received for the incoming message +whose data has not yet been copied out of +the packet(s) and into the buffer memory for the message. +.TP 25n +.BR HOMA_RPC_RX_READY +The incoming message has been received successfully and has been copied +to buffer memory; it is currently queued waiting for a thread to invoke +.BR recvmsg . .SH SHUTDOWN .PP The @@ -255,11 +503,28 @@ the default values should work fine in production. It's probably a bad idea to change any of these unless you are sure you have made detailed performance measurements to justify the change. .TP
Writing a nonzero value will cause Homa to +perform one of several actions (such as logging certain information or +freezing the timetrace), depending on the value. +For details on the recognized values, see the method +.BR homa_dointvec +in +.BR homa_plumbing.c . +.TP .I bpage_lease_usecs The amount of time (in microseconds) that a given core can own a page in a receive buffer pool before its ownership can be revoked by a different core. .TP +.IR busy_usecs +An integer value in microsecond units; if a core has been active in +the last +.IR busy_usecs +time, Homa will consider it to be "busy": in some situations Homa +will try to avoid scheduling conflicting activities on that core, in order to +avoid hot spots and achieve better load balancing. +.TP .I cutoff_version (Read-only) The current version for unscheduled cutoffs; incremented automatically when unsched_cutoffs is modified. @@ -278,17 +543,14 @@ aggressively (which could impact application performance) until the number of dead packet buffers drops below .I dead_buffs_limit . .TP -.IR duty_cycle -.I (Note: this feature has been temporarily removed, so this parameter -.I is currently ignored) -Determines the maximum fraction of network bandwidth that a single RPC -will be allowed to consume, in units of one-thousandth (e.g., 500 means 50%). -The main reason for this parameter is that it also limits the fraction -of a core that can be consumed by NAPI processing for a single incoming -message. Without this limit, a large incoming message can completely -consume one core for NAPI, which starves user threads on that core and -can result in high tail latency for short messages served by those -threads. +.IR defer_min_bytes +Messages shorter than this value will always be transmitted immediately, +without worrying about NIC queue length. Messages of this length or greater +will be queued if the NIC queue becomes too long, in order to implement +SRPT for outgoing messages. Short messages are transmitted immediately +because (a) it's unlikely that they can be generated rapidly enough +to produce significant queuing in the NIC and (b) deferring them can overload +the pacer to the point where it cannot keep the uplink fully saturated. .TP .IR fifo_grant_increment An integer value. When Homa decides to issue a grant to the oldest message @@ -310,6 +572,16 @@ Homa will freeze its internal timetrace. This is used for debugging and performance analysis; see the source code for the values currently supported. .TP +.IR gen3_softirq_cores +Used to query and change the set of SoftIRQ cores associated with each +GRO core. When written, the value contains 4 integers. The first is the number +of a core on which GRO processing occurs. The others are core numbers for +up to three other cores; the GRO core will choose from among these cores +when deciding where to direct batches of packets for SoftIRQ processing. +SoftIRQ core numbers of -1 can be used to reduce the number of SoftIRQ +choices. When read, the value contains 4 integers for each core, with the +same format as described above. +.TP .IR grant_fifo_fraction When sending grants, Homa normally uses an SRPT policy, granting to the message(s) with the fewest remaining bytes. This parameter can be @@ -321,46 +593,59 @@ of the bandwidth is for FIFO and 90% for SRPT). As of October 2020, a small value can provide significant benefits for the largest messages under very high loads, but for most loads its effect is negligible. 
.TP +.IR grant_recalc_usecs +How frequently (in microseconds) to scan the RPCs currently receiving grants +to see if the priority order is still correct. The order can become incorrect +if enough data arrives for a low-priority RPC so that it now has fewer bytes +left to grant than other RPCs that currently have higher priority. +Validating the order requires the global grant lock, so checking every time data +arrives would risk severe lock contention. Instead, the order is only checked +every +.I grant_recalc_usecs +microseconds. +.TP +.I gro_busy_usecs +An integer value used to determine whether or not to perform some +optimizations specified by +.IR gro_policy . +If the gap between the completion of one call to homa_gro_receive and +the invocation of the next call on the same core is less than this many +microseconds, the core is considered to be "busy", so optimizations +that add to the load of the core will not be performed. +.TP +.I gro_policy +An integer value that determines how Homa processes incoming packets +at the GRO level. See code in homa_offload.c for more details. +.TP .IR gso_force_software If this value is nonzero, Homa will perform GSO in software instead of asking the NIC to perform TSO in hardware. This can be useful when running with NICs that refuse to perform TSO on Homa packets. .TP -.IR gro_policy -An integer value that determines how Homa processes incoming packets -at the GRO level. See code in homa_offload.c for more details. -.TP -.IR gro_busy_usecs -An integer value. Under some -.IR gro_policy -settings, Homa will try not to assign SoftIRQ processing to a core if -it has had GRO-level activity in the last -.IR gro_busy_usecs -microseconds (in order to avoid hot spots that degrade load balancing). +.IR hijack_tcp +An integer value; if nonzero, Homa will transmit its packets as TCP +packets (e.g., using IPPROTO_TCP instead of IPPROTO_HOMA). This allows Homa +to make better use of NIC hardware support such as TSO and RSS, but it +requires Homa to intercept all incoming TCP packets to see if they are +actually Homa packets. Some might object to this interference with the +rest of the Linux kernel. +.TP +.IR homa_share +When there exist both Homa and TCP packets whose transmission has been +deferred because the NIC queue is overloaded, this determines how the +uplink bandwidth is allocated between Homa and TCP. This parameter is +a value between 0 and 100 indicating what percent of the uplink bandwidth +will be allocated to Homa; the remainder will be allocated to TCP. .TP .IR link_mbps An integer value specifying the bandwidth of this machine's uplink to the top-of-rack switch, in units of 1e06 bits per second. .TP -.IR log_topic -This value always reads as 0. Writing a nonzero value will cause Homa to -log various state information to the system log, depending on the value. -For details on the recognized values, consult the Homa code. -.TP .IR max_dead_buffs This parameter is updated by Homa to reflect the largest number of packet buffers occupied by dead (but not yet reaped) RPCs in a single socket at a given time. It may be reset to zero to initiate a new calculation. .TP -.IR max_grant_window -A nonzero value for this parameter enables an experimental new approach to -sending grants that allows more than -.I rtt_bytes -of outstanding grants for messages in some situations. It's not ready -for production use; read the code of the -.I homa_send_grants -method if you want to learn more about it.
-.TP .IR max_gro_skbs An integer value setting an upper limit on the number of buffers that Homa will allow to accumulate at driver level before passing them @@ -372,20 +657,45 @@ An integer value setting an upper limit on the size of an output packet, before segmentation using GSO. The Linux networking layer already imposes an upper limit; this configuration value can be used to reduce it further. .TP -.IR max_nic_queue_ns -An integer value specifying a NIC queue length in units of nanoseconds -(how long it will take the existing packets in the queue -to be fully transmitted). -If the NIC queue is longer than this, Homa will wait to queue additional -packets until the queue length drops below this value. -This parameter is used to throttle the NIC output queue in order to -implement SRPT more accurately for outbound messages. +.IR max_incoming +Homa will try to ensure that the total number of bytes authorized +to be sent (but not yet received) by all senders (including both unscheduled +bytes and granted bytes) does not exceed this value. If the known number +of incoming bytes exceeds this value (e.g., because many new messages +have appeared) then Homa will not issue grants until enough data has +been received to get below the limit. Used to control the total +utilization of TOR switch buffers. +.TP +.IR max_link_usage +In order to reduce the likelihood of queues forming in the NIC (which would +reduce the effectiveness of Homa's SRPT policy) Homa limits the rate at +which it submits outgoing packets to the NIC to slightly less than the +full uplink bandwidth. This parameter determines the degree of undercommitment. +It is an integer between 5 and 100 (inclusive) that specifies the maximum +percentage of link bandwidth that Homa will attempt to utilize. 100 means +Homa will attempt to utilize the full bandwidth. Smaller values reduce the +likelihood of queues forming, but also limit uplink utilization, which can +affect performance. Note that queues can sometimes form even with values less +than 100, since most NICs cannot transmit at full link speed under all +conditions. +.TP +.IR max_nic_est_backlog_usecs +This value is used to prevent the buildup of large queues of packets +waiting to be transmitted in the NIC. Homa keeps a running +estimate of how many bytes are currently queued in the NIC, assuming +the NIC transmits at full link speed. If the queue gets long enough +that it will take more than +.I max_nic_est_backlog_usecs +microseconds to transmit all of the queued data, Homa will wait to queue +additional packets until the estimated queue length drops below this value. +Large NIC queues are bad because +they interfere with Homa's SRPT scheduling policy. Once a packet has been queued in the NIC, Homa cannot schedule a higher priority packet in front of it; the longer the queue, the longer the delay for a newly arriving high priority packet. Lower values for this parameter reduce preemption lag and result in a better approximation of SRPT, but the value must be high enough to -queue the next packet before +allow time to queue the next packet before the NIC becomes idle; otherwise, output bandwidth will be lost. .TP .IR max_overcommit @@ -395,6 +705,17 @@ numbers generally improve link bandwidth utilization, but can result in more buffering and may affect tail latency if there are not many priority levels available. Must be at least 1.
.TP +.IR max_rpcs_per_peer +In Homa's original design, if there were multiple incoming RPCs from the +same peer, Homa would only send grants to the highest-priority of them. The +thought was that this RPC could consume all of the link bandwidth at both +sender and receiver, so there would be no point in granting to additional RPCs +from that peer. However, with faster networks, it isn't currently +possible for Homa to saturate a link with a single RPC. Homa will now +grant to multiple RPCs from the same peer; this integer value limits +the number of active RPCs from a single peer that Homa will grant at +once. +.TP .IR max_sched_prio (Read-only) An integer value specifying the highest priority level that Homa will use for scheduled packets; priority levels larger than this @@ -403,6 +724,13 @@ This parameter is set automatically by Homa when .I unsched_cutoffs is modified. .TP +.IR next_id +(Write-only) Setting this parameter will cause Homa to assign identifiers +for future outgoing RPCs starting at this value. This is typically used +during debugging to ensure that different nodes use different id ranges +(which simplifies some tools). Changing the value could be dangerous +in production. This parameter always reads as zero. +.TP .IR num_priorities The number of priority levels that Homa will use; Homa will use this many consecutive priority levels starting with 0 (before priority mapping). @@ -419,6 +747,41 @@ however, under very extreme loads a small value does provide benefit for the largest messages, when used with .I grant_fifo_fraction. .TP +.IR peer_gc_threshold +.PD 0 +.TP +.IR peer_idle_secs_min +.TP +.IR peer_idle_secs_max +.TP +.IR peer_net_max +.IP +These options control garbage collection of peer objects. Homa maintains +long-lived state for each peer machine that it has communicated with; peer +objects are kept separately for each network namespace. +These options are used to limit memory utilization from peer objects. If the +total number of peer objects across all namespaces is less than +.IR peer_gc_threshold +then no peer garbage collection occurs. If the number of peer objects is +at least +.IR peer_gc_threshold +then Homa will free peers that have not been referenced in the last +.IR peer_idle_secs_max +seconds in order to reduce the total number of peer objects below +.IR peer_gc_threshold . +In addition, if a given network namespace has more than +.IR peer_net_max +peers allocated, then peers in that namespace are candidates for +freeing if they have not been referenced in the last +.IR peer_idle_secs_min +seconds. When choosing among candidates to free, Homa uses a semi-random +approach that +(a) prefers to evict peers from namespaces above the +.IR peer_net_max +threshold over those from underloaded namespaces +and (b) prefers to evict peers whose most recent usage is farthest in the past. +.PD +.TP .IR poll_usecs When a thread waits for an incoming message, Homa first busy-waits for a short amount of time before putting the thread to sleep. If a message arrives @@ -454,8 +817,7 @@ on servers. .TP .IR resend_interval An integer value specifying how frequently resend requests may be sent -to a given peer (regardless of how many RPCs are outstanding to that -peer). This is in units +for a given missing packet. This is in units of "ticks" (see .I resend_ticks below). This value and @@ -482,26 +844,32 @@ value of .IR resend_ticks , the client issues a RESEND. Since the message has not yet been processed on the server, it sends UNKNOWN, causing the client to restart.
A larger value of -.IR -resend_ticks +.IR resend_ticks reduces the likelihood of restarts (but doesn't completely eliminate the problem). .TP .IR rtt_bytes -An estimate of the number of bytes that can be transmitted on the wire -by a host in the time it takes that host to send a full-size packet to -another host and receive back a grant packet. Used by Homa to ensure -full network bandwidth utilization (or whatever is specified by the -.IR duty_cycle -parameter). -.TP -.IR sync_freeze -If a nonzero value is written into this parameter, then upon completion -of the next client RPC issued from this machine, Homa will will clear -this parameter back to 0, then freeze the -local timetrace and also the timetrace of the server for the RPC. This -is useful during debugging to extract timetraces for the same interval -on multiple machines. +This configuration parameter is no longer supported; it has been split +into two different parameters: +.IR unsched_bytes +and +.IR window . +.TP +.IR skb_page_frees_per_sec +Homa maintains a pool of free pages on each NUMA node for use in +outgoing sk_buffs, in order to eliminate the overhead of allocating +new pages from scratch. This option specifies the total rate (across all +pools, not per-pool) at which pages should be released from pools back to +Linux, in pages per second. The idea behind this parameter is to release +pages slowly enough that replenishing them won't add significant overhead if +they are still needed, while also ensuring that pools don't retain a lot more +pages than needed. +.TP +.IR skb_page_pool_min_kb +When releasing pages from the sk_buff page pools back to Linux, Homa will +not release pages from a pool if the amount of unused space in +the pool has been less than this (specified in Kbytes) at any point +in the recent past. .TP .IR throttle_min_bytes An integer value specifying the smallest packet size subject to @@ -522,6 +890,10 @@ receiving any packets from the peer, then Homa will consider the peer dead and abort all RPCs involving that peer with .BR ETIMEDOUT . .TP +.IR unsched_bytes +The number of bytes that may be transmitted from a new message without +waiting for grants from the receiver. +.TP .IR unsched_cutoffs An array of 8 integer values. The nth element specifies the largest message size, in bytes, for which priority level n will be used. @@ -536,6 +908,23 @@ this will be used for scheduled packets. .IR verbose An integer value; nonzero means that Homa will generate additional log output. +.TP +.IR window +The maximum number of unreceived bytes that the receiver may grant for +a message at a given time. If this value is zero, then receivers will +use a dynamic approach that depends on the number of grantable messages; +with fewer grantable messages, the window for each message increases. +Specifically, if there are N grantable messages, the window for each +of these messages will be +.IR max_incoming /(N+1). +This approach was inspired by the paper "Dynamic Queue Length Thresholds +for Shared-Memory Packet Switches"; the idea is to maintain unused +granting capacity equal to the window for each of the current messages. +.TP +.IR wmem_max +Maximum amount of memory that may be used for outgoing packet buffers +by a single socket at a given time. Output message transmissions will +block when this limit is reached. .SH /PROC FILES .PP In addition to files for the configuration parameters described above, @@ -559,7 +948,4 @@ the core number for the following lines. 
A few counters appear before the first "core" line: these are core-independent counters such as elapsed time. .SH SEE ALSO .BR recvmsg (2), -.BR sendmsg (2), -.BR homa_abort (3), -.BR homa_reply (3), -.BR homa_send (3) +.BR sendmsg (2) diff --git a/man/homa_abort.3 b/man/homa_abort.3 deleted file mode 100644 index edecbafc..00000000 --- a/man/homa_abort.3 +++ /dev/null @@ -1,93 +0,0 @@ -.TH HOMA_ABORT 3 2021-08-24 "Homa" "Linux Programmer's Manual" -.SH NAME -homa_abort \- terminate an outgoing RPC -.SH SYNOPSIS -.nf -.B #include -.PP -.BI "int homa_abort(int " sockfd ", uint64_t " id ", int " error ); -.PP -.BI "int homa_abortp(int " sockfd ", struct homa_abort_args *" args ); -.fi -.SH DESCRIPTION -These two functions will cancel the execution of one (or all) outgoing RPCs. -They behave identically except that -.BR homa_abort -receives its arguments as separate parameters, whereas -.BR homa_abortp -packs all of the arguments into a structure: -.PP -.in +4n -.ps -1 -.vs -2 -.EX -struct homa_abort_args { - uint64_t id; - int error; -}; -.EE -.vs +2 -.ps +1 -.in -.PP -The -.I id -argument contains the identifier for an RPC; if this RPC is active on -.IR sockfd -then it is aborted. -If -.I id -is 0 then all outgoing RPCs on -.IR sockfd -will be aborted. -If -.I error -is 0, then the matching RPCs will be deleted and all state associated -with them will be freed (the RPCs will not -be returned by -.BR homa_recv ). -If -.I error -is nonzero, then the RPC(s) will immediately be placed in the completed -state so that they can be returned by -.BR homa_recv ; -the -.BR homa_recv -call will return an error, with an -.I errno -value of -.I error. -Regardless of whether the RPC(s) are completed or freed, the -servers for the RPCs -are not notified of the abort. If a -request has already been transmitted to the server at the time -.B homa_abort -is invoked, it may still be executed on the server. Any response -from the server will be discarded. - -.SH RETURN VALUE -On success, the return value is 0. -On error, \-1 is returned and -.I errno -is set appropriately. - -.SH ERRORS -.TP -.B EALREADY -.I error -and -.I id -were both nonzero, but the RPC was already in the completed state. In this -case the system call has no effect. -.TP -.B EFAULT -An invalid user space address was specified for an argument. -.TP -.B EINVAL -There is no RPC corresponding to -.IR id . -.SH SEE ALSO -.BR homa_recv (3), -.BR homa_reply (3), -.BR homa_send (3), -.BR homa (7) diff --git a/man/homa_reply.3 b/man/homa_reply.3 deleted file mode 100644 index 4a898cb9..00000000 --- a/man/homa_reply.3 +++ /dev/null @@ -1,92 +0,0 @@ -.TH HOMA_REPLY 3 2022-12-13 "Homa" "Linux Programmer's Manual" -.SH NAME -homa_reply, homa_replyv \- send a Homa response message -.SH SYNOPSIS -.nf -.B #include -.PP -.BI "int homa_reply(int " sockfd ", const void *" message_buf ", size_t " \ -length , -.BI " const struct sockaddr *" dest_addr ", size_t " \ -addrlen , -.BI " uint64_t " id );> -.PP -.BI "int homa_replyv(int " sockfd ", const struct iovec *" iov ", size_t " \ -iovcnt , -.BI " const struct sockaddr *" dest_addr ", size_t " \ -addrlen , -.BI " uint64_t " id ); -.fi -.SH DESCRIPTION -.BR homa_reply -and -.BR homa_replyv -are convenience functions layered on top of the -.B sendmsg -system call. -Either may be used to transmit a response message using the Homa -transport protocol. -The argument -.I sockfd -is the file descriptor of a Homa socket to use for sending the response. 
-With -.BR homa_reply -the response message is stored in a single contiguous buffer pointed to by -.IR message_buf , -and the argument -.I length -gives the length of the message in bytes. -With -.BR homa_replyv -the response message consists of multiple disjoint chunks, specified -by -.I iovcnt -descriptors at -.IR iov . -In either case the total message length must not exceed -.BR HOMA_MAX_MESSAGE_LENGTH . -The destination for the response is given by -.I dest_addr , -which can hold either an IPv4 or an IPv6 address: -.PP -.in +4n -.ps -1 -.vs -2 -.EX -typedef union sockaddr_in_union { - struct sockaddr sa; - struct sockaddr_in in4; - struct sockaddr_in6 in6; -} sockaddr_in_union; -.EE -.vs +2 -.ps +1 -.in -.PP -The argument -.I id -is an identifier previously returned by -.BR recvmsg (2); -along with -.IR dest_addr , -it identifies the request for which this message is the response. -.PP -This function returns as soon as the response has been queued for -transmission. -.SH RETURN VALUE -On success, the return value is 0. -On error, \-1 is returned and -.I errno -is set appropriately. -.SH ERRORS -See -.BR sendmsg (2) -for details on the -.I errno -values returned after errors. -.SH SEE ALSO -.BR recvmsg (2), -.BR sendmsg (2), -.BR homa_abort (3), -.BR homa_send (3), -.BR homa (7) diff --git a/man/homa_send.3 b/man/homa_send.3 deleted file mode 100644 index 56f0bad3..00000000 --- a/man/homa_send.3 +++ /dev/null @@ -1,108 +0,0 @@ -.TH HOMA_SEND 3 2022-12-13 "Homa" "Linux Programmer's Manual" -.SH NAME -homa_send, homa_sendv \- send a request message -.SH SYNOPSIS -.nf -.B #include -.PP -.BI "int homa_send(int " sockfd ", const void *" message_buf ", size_t " length \ -", const sockaddr_in_union *" dest_addr ", -.BI " uint64_t *" id ", uint64_t " \ -"completion_cookie" ); -.PP -.BI "int homa_sendv(int " sockfd ", const struct iovec *" iov ", size_t " \ -iovcnt ", const sockaddr_in_union *" dest_addr , -.BI " uint64_t *" id ", uint64_t " \ -"completion_cookie" ); -.fi -.SH DESCRIPTION -.BR homa_send -and -.BR homa_sendv -are convenience functions layered on top of the -.B sendmsg -system call. -Either may be used to transmit a request message using the Homa -transport protocol. -The argument -.I sockfd -is the file descriptor of the sending socket; this must be a Homa socket. -With -.BR homa_send -the request message is stored in a single contiguous buffer pointed to by -.IR message_buf , -and the argument -.I length -gives the length of the message in bytes. -With -.BR homa_sendv -the request message consists of multiple disjoint chunks, specified -by -.I iovcnt -descriptors at -.IR iov . -In either case, the total message length must not exceed -.BR HOMA_MAX_MESSAGE_LENGTH . -The destination socket for the request is given by -.IR dest_addr , -which can hold either an IPv4 or IPv6 address: -.PP -.in +4n -.ps -1 -.vs -2 -.EX -typedef union sockaddr_in_union { - struct sockaddr sa; - struct sockaddr_in in4; - struct sockaddr_in6 in6; -} sockaddr_in_union; -.EE -.vs +2 -.ps +1 -.in -.PP -If -.I id -is not NULL, an identifier for the request is returned at -.IR *id. -The identifier will be unique among all requests issued on -.IR sockfd , -and can be used to match the request with a response returned later by -.BR homa_reply (3). -The -.I completion_cookie -argument provides application-specific identifying information about the RPC, -such as the address of a data structure used to manage the -RPC; it will be returned by -.BR homa_recv -when the RPC completes. 
-.PP -This function returns as soon as the message has been queued for -transmission. - -.SH RETURN VALUE -On success, the return value is 0 and an identifier for the request -is stored in -.I *id -(if -.I id -is not NULL). -The identifier can be used later to match the request -with the corresponding response, using -.BR homa_reply (3). -On error, \-1 is returned and -.I errno -is set appropriately. -.SH ERRORS -After an error return, -.I errno -will contain additional information about the cause of the error. -See -.BR sendmsg (2) -for details. -.SH SEE ALSO -.BR recvmsg (2), -.BR sendmsg (2), -.BR homa_abort (3), -.BR homa_reply (3), -.BR homa (7) diff --git a/man/recvmsg.2 b/man/recvmsg.2 index 2ef5d602..5d094483 100644 --- a/man/recvmsg.2 +++ b/man/recvmsg.2 @@ -1,4 +1,4 @@ -.TH RECVMSG 2 2022-12-13 "Homa" "Linux Programmer's Manual" +.TH RECVMSG 2 2024-11-11 "Homa" "Linux Programmer's Manual" .SH NAME recvmsg \- receive a Homa message .SH SYNOPSIS @@ -15,12 +15,12 @@ on Homa sockets. The .I sockfd argument must refer to a Homa socket. The .I msg -argument describes which incoming messages are of interest, and is +argument describes which incoming messages are of interest and is used to return information about the message that is received. The .I flags -argument is not used for its +argument is not used except for its .B MSG_DONTWAIT -bit, which can be used to request nonblocking behavior +bit, which can be used to request nonblocking behavior. .PP The .B msg @@ -57,13 +57,13 @@ field must refer to a structure of the following type: .vs -2 .EX struct homa_recvmsg_args { - uint64_t id; /* If nonzero, specifies id of - * desired RPC. */ - uint64_t completion_cookie; /* Value from sendmsg for request. */ - int flags; /* OR-ed combination of bits. */ - uint32_t num_bpages; /* Number of valid entries in + uint64_t id; /* If nonzero, specifies id of + * desired RPC, which must be + * private. */ + uint64_t completion_cookie; /* Value from sendmsg for request. */ + uint32_t num_bpages; /* Number of valid entries in * bpage_offsets. */ - uint32_t bpage_offsets[HOMA_MAX_BPAGES] /* Tokens for buffer pages. */ + uint32_t bpage_offsets[HOMA_MAX_BPAGES]; /* Tokens for buffer pages. */ }; .EE .vs +2 @@ -78,28 +78,25 @@ structs are used both for passing parameter information into Homa and for receiving result information: .nr step 1 1 .IP \[bu] 2 -The input values for -.B flags -and +The initial value of .B id -indicate which messages are of interest to the caller. -.B Flags -is a bitmask. If it contains the -.B HOMA_RECVMSG_REQUEST -bit, then the caller is interested in any request message. -If it contains the -.B HOMA_RECVMSG_RESPONSE -bit, then the caller is interested in any response message. -If the -.B HOMA_RECVMSG_RESPONSE -bit is zero but +indicates what message(s) the caller is interested in, in one of two ways. +If the value is nonzero, it gives the RPC identifier for a specific +RPC; it must be an RPC for which the caller is the client, and it must +have been specified as +.I private +when +.B sendmsg +was invoked to initiate the RPC. In this case, +.B recvmsg +will wait for the response message for that RPC. +If the initial value of .B id -is nonzero, then the caller is interested in receiving a response -for the RPC given by -.B id. +is zero, then the system call will wait for any message that is not a private +response (the message could be either a request or response). .IP \[bu] -Homa will use the structs to return information about the message received.
-The
+On a successful return, Homa will use the structs to return information
+about the message received. The
.B id
field will be set to the RPC identifier for the received message
(if
.B id
@@ -111,7 +108,7 @@ is not NULL, then a
.B sockaddr_in
or
.B sockaddr_in6
-will be stored at its target (depnding on the address family of
+will be stored at its target (depending on the address family of
.IR sockfd ),
describing the sender of the message.
For response messages, the
@@ -171,24 +168,22 @@ is irrelevant, and a single
.B recvmsg
call can include bpages from multiple messages; all that matters is
that each bpage is returned to Homa exactly once.
+.IP \[bu]
+.IR msg ->\c
+.B msg_controllen
+will be set to zero by the call.
.PP
.B recvmsg
normally waits until a suitable message has arrived, but nonblocking
-behavior may be requested in any of three ways. First, the
-.BR HOMA_RECVMSG_NONBLOCKING
-bit may be set in the
-.B flags
-field of the
-.BR homa_recvmsg_args
-struct. Second, the
+behavior may be requested either by setting the
.BR MSG_DONTWAIT
-bit can be set in the
+bit in the
.BR flags
argument to
-.BR recvmsg .
-Third, the
+.BR recvmsg
+or by setting the
.B O_NONBLOCK
-flag may be set for the socket using
+flag for the socket using
.BR fcntl .
If
.B recvmsg
@@ -198,13 +193,14 @@ it will fail with an
value of
.BR EAGAIN .
.SH RETURN VALUE
-The return value is 0 for success and -1 if an error occurred. If
+The return value is the length of the message in bytes for success and
+-1 if an error occurred. If
.B id
is zero after an error, it means that the failure prevented a message
from being received.
If
.B id
-is zero, it means that a specific RPC has failed.
+is nonzero, it means that a specific RPC has failed.
For request messages, this can happen after errors such as
.B EFAULT
errors (the kernel couldn't write to user space to return
@@ -221,14 +217,19 @@ or
can occur if there was no server at the specified address, it
couldn't be reached, or it timed out, respectively.
.B ENOMEM
-can also occur for responses.
+can also occur for responses. If
+.B id
+is nonzero and even (i.e., the message that failed was a response), then the
+.B completion_cookie
+field will also be set.
.PP
After successfully receiving a message, an application has two
responsibilities. First, it must eventually return the message's
bpages to Homa as described above. Second, if the message is a
request, the application must eventually send a response (Homa
retains state for each request until its response has been sent; if
-no responses are sent, kernel state will accumulate without bound).
+no responses are sent, kernel state will accumulate and the socket may
+eventually block).
.SH ERRORS
.PP
When
@@ -282,8 +283,5 @@ was sent.
The socket has been disabled using
.BR shutdown (2).
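+.SH EXAMPLE
+The sketch below shows one plausible way to combine
+.BR sendmsg (2)
+and
+.B recvmsg
+for a private RPC. It is only an illustration (error handling is
+elided); it assumes a Homa socket
+.IR sockfd ,
+a receive buffer region starting at
+.IR buf_region ,
+and that the kernel stores the new RPC's identifier back into the
+.B id
+field of
+.B homa_sendmsg_args
+when the request is sent:
+.PP
+.in +4n
+.EX
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <netinet/in.h>
+#include <homa.h>
+
+ssize_t private_rpc(int sockfd, const struct sockaddr_in6 *server,
+        void *request, size_t length, char *buf_region)
+{
+    struct homa_sendmsg_args send_args = {0};
+    struct homa_recvmsg_args recv_args = {0};
+    struct iovec iov = {request, length};
+    struct msghdr hdr = {0};
+
+    /* Issue the request; id stays 0, marking this as a new RPC. */
+    send_args.flags = HOMA_SENDMSG_PRIVATE;
+    hdr.msg_name = (void *) server;
+    hdr.msg_namelen = sizeof(*server);
+    hdr.msg_iov = &iov;
+    hdr.msg_iovlen = 1;
+    hdr.msg_control = &send_args;
+    hdr.msg_controllen = sizeof(send_args);
+    if (sendmsg(sockfd, &hdr, 0) < 0)
+        return -1;
+
+    /* Wait for the response to this specific RPC.  On success the
+     * response occupies recv_args.num_bpages buffer pages, located at
+     * buf_region + recv_args.bpage_offsets[i]; passing recv_args
+     * unchanged to a later recvmsg returns those bpages to Homa. */
+    recv_args.id = send_args.id;
+    hdr.msg_iov = NULL;
+    hdr.msg_iovlen = 0;
+    hdr.msg_control = &recv_args;
+    hdr.msg_controllen = sizeof(recv_args);
+    return recvmsg(sockfd, &hdr, 0);
+}
+.EE
+.in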
.SH SEE ALSO
-.BR recvmsg (2),
-.BR homa_abort (3),
-.BR homa_reply (3),
-.BR homa_send (3),
+.BR sendmsg (2),
.BR homa (7)
diff --git a/man/sendmsg.2 b/man/sendmsg.2
index c1daed8e..e0784d69 100644
--- a/man/sendmsg.2
+++ b/man/sendmsg.2
@@ -1,4 +1,4 @@
-.TH SENDMSG 2 2022-12-14 "Homa" "Linux Programmer's Manual"
+.TH SENDMSG 2 2023-11-02 "Homa" "Linux Programmer's Manual"
.SH NAME
sendmsg \- send a Homa request or response message
.SH SYNOPSIS
@@ -47,7 +47,7 @@ struct msghdr {
Homa requires additional information beyond what can be represented
in a
.BR "struct msghdr" ,
so the
-.B msg_controllen
+.B msg_control
field must refer to a structure of the following type:
.PP
.in +4n
@@ -56,8 +56,9 @@ field must refer to a structure of the following type:
.EX
struct homa_sendmsg_args {
    uint64_t id;                /* RPC identifier. */
-   uint64_t completion_cookie; /* For requests only; value to return
+   __u64 completion_cookie;    /* For requests only; value to return
                                 * along with response. */
+   __u32 flags;                /* OR'ed combination of bits. */
};
.EE
.vs +2
@@ -92,8 +93,47 @@ and
.IR msg ->\c
.BR msg_name .
.PP
+The
+.B flags
+field of
+.B homa_sendmsg_args
+contains an OR'ed collection of bits. At present only a single
+flag bit is supported.
+.TP
+.B HOMA_SENDMSG_PRIVATE
+Ignored when sending responses
+.RB ( id " is nonzero)."
+For requests, this bit will mark the RPC as
+.IR private .
+The response for a
+private RPC can only be retrieved by specifying the RPC's id explicitly
+when invoking
+.BR recvmsg .
+In addition, system calls such as
+.BR select (2)
+cannot be used to determine when a private response has arrived.
+.PP
.B sendmsg
returns as soon as the message has been queued for transmission.
+.B sendmsg
+will block if there are so many outstanding messages on the socket that
+its limit on write buffer memory has been exceeded. However, if the
+.BR MSG_DONTWAIT
+bit is set in the
+.BR flags
+argument to
+.BR sendmsg
+or the
+.B O_NONBLOCK
+flag is set for the socket using
+.BR fcntl
+then
+.B sendmsg
+will fail with an
+.I errno
+value of
+.BR EAGAIN
+instead of blocking.
.SH RETURN VALUE
The return value is 0 for success and -1 if an error occurred.
.SH ERRORS
@@ -138,7 +178,4 @@ The socket has been disabled using
.BR shutdown (2).
.SH SEE ALSO
.BR recvmsg (2),
-.BR homa_abort (3),
-.BR homa_reply (3),
-.BR homa_send (3),
.BR homa (7)
diff --git a/murmurhash3.h b/murmurhash3.h
new file mode 100644
index 00000000..1ed1f0b6
--- /dev/null
+++ b/murmurhash3.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
+
+/* This file contains a limited implementation of MurmurHash3; it is
+ * used for rhashtables instead of the default jhash because it is
+ * faster (25 ns vs. 40 ns as of May 2025).
+ */
+
+/**
+ * murmurhash3() - Hash function.
+ * @data:   Pointer to key for which a hash is desired.
+ * @len:    Length of the key; must be a multiple of 4.
+ * @seed:   Seed for the hash.
+ * Return:  A 32-bit hash value for the given key.
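+ *
+ * Example use with an rhashtable (a sketch only; my_key and my_entry
+ * are hypothetical types, and key_len must be a multiple of 4).  The
+ * function's signature matches rht_hashfn_t, so it can be assigned
+ * directly:
+ *
+ *	struct my_entry {
+ *		struct my_key key;
+ *		struct rhash_head node;
+ *	};
+ *
+ *	static const struct rhashtable_params my_params = {
+ *		.key_len     = sizeof(struct my_key),
+ *		.key_offset  = offsetof(struct my_entry, key),
+ *		.head_offset = offsetof(struct my_entry, node),
+ *		.hashfn      = murmurhash3,
+ *	};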
+ */
+static inline u32 murmurhash3(const void *data, u32 len, u32 seed)
+{
+	const u32 c1 = 0xcc9e2d51;
+	const u32 c2 = 0x1b873593;
+	const u32 *key = data;
+	u32 h = seed;
+
+	len = len >> 2;
+	for (size_t i = 0; i < len; i++) {
+		u32 k = key[i];
+
+		k *= c1;
+		k = (k << 15) | (k >> (32 - 15));
+		k *= c2;
+
+		h ^= k;
+		h = (h << 13) | (h >> (32 - 13));
+		h = h * 5 + 0xe6546b64;
+	}
+
+	/* Total number of input bytes (len now holds the count in words) */
+	h ^= len * 4;
+
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
diff --git a/notes.txt b/notes.txt
index e95a1b71..b008fc3d 100755
--- a/notes.txt
+++ b/notes.txt
@@ -1,15 +1,161 @@
Notes for Homa implementation in Linux:
---------------------------------------
-* IPv6 issues:
-  * See if error checking made syscalls slower.
+* (12/12/25) Something is wrong with the xl170 cluster: both Homa and TCP
+  are showing considerably worse performance than previously. I tried multiple
+  different clusters, and tried backing out to older versions of Homa and
+  Linux, but the problems don't go away.
-* Refactor of granting mechanism:
-  * Eliminate grant_increment: change to fifo_grant_increment instead
-  * grant_non_fifo may need to grant to a message that is also receiving
-    regular grants
-  * What if a message receives data beyond incoming, which completes the
-    message?
+* Performance problems to track down:
+  * On xl170s, both TCP and Homa run slower with qdisc than pacer (P99 for
+    TCP small packets increases by 50%)
+  * On c6620 cluster, "cp_node client --workload 500000" gets 20 Gbps each
+    way with TCP but only 15 Gbps with Homa.
+
+* Significant new functionality:
+  * Run Homa SoftIRQ on the GRO core? This would eliminate the latency and
+    cache overheads of switching cores, and RSS already provides adequate
+    load balancing. Possibly even invoke GRO during SoftIRQ?
+  * Don't transmit client port number. Instead, vary the contents of this
+    field to generate packet spraying.
+  * Implement zero-copy for output (may work only on client side).
+  * Change grant mechanism so that each message is either entirely
+    scheduled or entirely unscheduled (see July 2024 note below).
+  * Fix Homa to ensure at-most-once semantics (see "execution multiple
+    times" below).
+  * Implement some of the ideas from SIRD to reduce buffer utilization.
+  * Refactor the granting mechanism to grant separately for each device.
+
+* Move interest cleanup code from homa_sock to a new function in
+  homa_interest. Also move wakeup code from homa_rpc_handoff.
+
+* Find a way to reduce pacer core utilization? It currently takes about 0.7
+  core when running at high load. Maybe use polling threads instead?
+
+* Eliminate use of link_mbps in homa_grant.c; perhaps replace with
+  configuration parameter fifo_mbps? Maybe the grant mechanism needs
+  to be net_device-specific?
+
+* Eliminate HOMA_FLAG_DONT_THROTTLE
+
+* Optimizations for skb freeing:
+  * In GRO, merge page frags out of skbs and return skbs to napi with
+    napi_reuse_skb (return GRO_MERGED_FREE?). See also napi_get_frags (used
+    by the driver?).
+  * Apparently TCP has a faster way of eventually freeing the merged skb
+    (return things to the allocating core): see tcp_eat_recv_skb.
+    * This uses the function skb_attempt_defer_free, which is not currently
+      exported for extensions.
+
+* Rework cp_node so that there aren't separate senders and receivers on the
+  client. Instead, have each client thread send, then conditionally receive,
+  then send again, etc. Hmmm, I believe there is a reason why this won't
+  work, but I have forgotten what it is.
+
+* (July 2024) Found throughput problem in 2-node "--workload 50000 --one-way"
+  benchmark. The first packet for message N doesn't get sent until message
+  N-1 has been completely transmitted. This results in a gap between the
+  completion of message N and the arrival of a grant for message N+1, which
+  wastes throughput. Perhaps send an empty data packet for message N+1 so that
+  the grant can return earlier? Or, make a bigger change:
+  * Either a message is entirely scheduled or entirely unscheduled.
+  * For a scheduled message, send a 0-length data packet immediately,
+    before starting to copy in from user space.
+  * The round-trip for a grant can now happen in parallel with copying
+    bytes in from user space (it takes 16 usec to copy in 60 KB at 30 Gbps).
+  * The grant will be sent more quickly by the server because it doesn't
+    have to process a batch of data packets first
+
+* The current implementation can execute RPCs multiple times on the server:
+  * The initial RPC arrives, but takes the slow path to SoftIRQ, which can
+    take many ms.
+  * Meantime the client retries, and the retry succeeds: the request is
+    processed, the response is returned to the client, and the RPC is
+    freed.
+  * Eventually SoftIRQ wakes up to handle the original packet, which re-creates
+    the RPC and it gets serviced a second time.
+
+* Don't understand why W3 performance is so variable under Gen3. Also,
+  it's worth comparing tthoma output for W4 and W3 under Gen2; e.g.,
+  W3 has way more active outgoing messages than W4.
+
+* When requesting ACKs, must eventually give up and delete RPC.
+* Eliminate msgin.scheduled: just compute when needed?
+* Replace msgin.bytes_remaining with bytes_received?
+* Include the GSO packet size in the initial packet, so the receiver
+  can predict how much grants will get rounded up.
+* Out-of-order incoming grants seem to be pretty common.
+* Round up bpage fragments to 64 byte boundaries
+
+* CloudLab cluster issues:
+  * amd272 had a problem where all xmits from core 47 incurred a 1-2 ms
+    delay; power-cycling it fixed the problem.
+  * (July 2025) amd163 was causing cp_vs_tcp to run slowly (max cluster
+    throughput only 16-18 Gbps).
+
+* Notes on performance data from buffer benchmarking 8/2023:
+  * Restricting buffer space ("buffers"):
+    * Homa W5: unrestricted usage 3.8 MB/20 nodes
+      Performance degrades significantly below 80% of this (3 MB/20 nodes)
+      Homa still has better performance than DCTCP down to 1.9 MB
+    * DCTCP W5: unrestricted usage 5.8 MB/20 nodes
+      Performance degrades significantly below 25% of this (1.4 MB/20 nodes)
+    * TCP W5: unrestricted usage 13.1 MB/20 nodes (all available space)
+      Performance degrades significantly below 80% of this (10.5 MB/20 nodes)
+  * Varying the number of nodes ("nodes"):
+    * Homa usage seems to increase gradually with number of nodes
+    * TCP/DCTCP usage decreases with number of nodes
+  * Varying the link utilization ("link_util"):
+    * Homa: 50% utilization drops min. buffer usage by ~4x relative to 80%
+      Large messages suffer more at higher utilization
+    * DCTCP benefits less from lower link utilization: min buffer usage
+      drops by 35% at 50% utilization vs. 80% utilization
+      All message sizes benefit as utilization drops
+    * At 50% utilization, Homa's min buffer usage is less than DCTCP! (recheck)
+    * TCP benefits more than DCTCP: 3x reduction in min buffers at 50% vs.
+      80%
+  * Varying the DCTCP marking threshold:
+    * Min buffer usage increases monotonically with marking threshold
+    * Slowdown is relatively stable (a bit higher at largest and smallest
+      thresholds).
+    * For small messages, P50 is better with low threshold, but P99 is higher
+  * Dynamic windows for Homa:
+    * Min buffer usage increases monotonically with larger windows
+    * Slowdown drops, then increases as window size increases
+    * Unclear whether there is a point where dwin has a better combination
+      of performance and buffer usage; maybe in the range of 3-4x rttbytes?
+  * Varying Homa's overcommitment:
+    * Results don't seem consistent with other benchmarks
+    * For W5, dropping overcommitment to 4 increases slowdown by 3x, drops
+      min buffer usage by 2x
+    * Even an overcommit of 7 increases slowdown by 2x?
+  * Varying unsched_bytes:
+    * For W4 and W5, increasing unsched_bytes from 40K to 120K increases
+      min buffer usage by 2x
+
+* Issues for next round of full-rack testing:
+  * Verify that unrestricted slowdown curves for the buffer benchmarks match
+    the "standard" curves.
+  * Some metrics seem to vary a lot:
+    * Unrestricted buffer usage in "nodes" vs. "buffers" vs. "link_util"
+    * 10% drop in Homa performance: "nodes" vs. "buffers" vs. "link_util"
+    * DCTCP buffer usage in "nodes" first drops, then starts to rise;
+      recheck?
+  * Need more nodes to see if buffer usage/node eventually plateaus (need to
+    run all tests at 40 nodes)
+  * Explore dynamic windows more thoroughly
+  * Overcommit results look fishy
+  * Lower the threshold for assuming 0 min buffer usage (perhaps base on us
+    rather than MB?)
+  * Create a "scan_metrics" method to scan .metrics files and look for
+    outliers in multiple different ways?
+  * Compute a "P99 average" by computing average of worst 1% slowdowns
+    in 10 buckets (100?) according to message length?
+  * unsched for w3 has huge variation in rtts; 40k is particularly bad. It
+    isn't keeping up on BW either; try raising client-max? Also, there seem
+    to be too many metrics files (the problem is with active_nodes in cperf.py?)
+
+* recvmsg doesn't seem to return an address if there is an error? May
+  need to return the address in a different place?
* Pinning memory: see mm.h and mm/gup.c
  * get_user_page
@@ -17,38 +163,15 @@ Notes for Homa implementation in Linux:
  * pin_user_page (not sure the difference from get_user_page)
* Performance-related tasks:
-  * Improve software GSO by making segments refer to the initial large
-    buffer instead of copying?
-  * Rework granting to
-  * Implement sk_buff caching for output buffers:
-    * Allocation is slow (2-10 us on AMD processors; check on Intel?)
-    * Large buffers exceed KMALLOC_MAX_CACHE_SIZE, so they aren't cached
-      in slabs
-    * Keep free lists in Homa for different sizes (e.g. pre-GSO and GSO),
-      append output buffers there
-    * Can recycle an sk_buff by calling build_skb_around().
  * Rework FIFO granting so that it doesn't consider homa->max_overcommit
    (just find the oldest message that doesn't have a pity grant)? Also,
    it doesn't look like homa_grant_fifo is keeping track of pity grants
    precisely; perhaps add another RPC field for this?
-  * Re-implement the duty-cycle mechanism. Use a generalized pacer to
-    control grants:
-    * Parameters:
-      * Allowable throughput
-      * Max accumulation of credits
-    * Methods:
-      * Request (current time, amount) (possibly 2 stages: isItOk and doIt?)
-    * Or, just reduce the link speed and let the pacer handler this?
-  * Analyze 40-us W4 short message latency by writing a time-trace
-    analyzer that tracks NIC queue length.
  * Perhaps limit the number of polling threads per socket, to solve the
    problems with having lots of receiver threads?
  * Move some reaping to the pacer? It has time to spare
  * Figure out why TCP W2 P99 gets worse with higher --client-max
  * See if turning off c-states allows shorter polling intervals?
-  * Consider a permanent reduction in rtt_bytes.
-  * Consider reducing throttle_min_bytes to see if it helps region 1
-    in the CDF?
  * Modify cp_node's TCP to use multiple connections per client-server pair
  * Why is TCP beating Homa on cp_server_ports? Perhaps TCP servers are
    getting >1 request per kernel call?
@@ -63,15 +186,6 @@ everything up to the latest received offset.
  * Try more aggressive retries (e.g. if a missing packet is sufficiently
    long ago, don't wait for timeout).
-  * Eliminate hot spots involving NAPI:
-    * Arrange for incoming bursts to be divided into batches where
-      alternate batches do their NAPI on 2 different cores.
-    * To do this, use TCP for Homa!
-      * Send Homa packets using TCP, and use different ports to force
-        different NAPI cores
-      * Interpose on the TCP packet reception hooks, and redirect
-        real TCP packets back to TCP.
-  * Consider replacing grantable list with a heap?
* Unimplemented interface functions.
  * Learn about CONFIG_COMPAT and whether it needs to be supported in
    struct proto and struct proto_ops.
@@ -83,7 +197,6 @@
  * Socket not supported on server (or server process ends while
    processing request).
  * Server timeout
-  * Is it safe to use non-locking skb queue functions?
  * Is the RCU usage for sockets safe? In particular, how long is it safe
    to use a homa_sock returned by homa_find_socket? Could it be deleted
    from underneath us? This question may no longer be relevant, given the
@@ -91,16 +204,12 @@
  * Can a packet input handler be invoked multiple times concurrently?
  * What is audit_sockaddr? Do I need to invoke it when I read sockaddrs
    from user space?
-  * When a struct homa is destroyed, all of its sockets end up in an unsafe
-    state in terms of their socktab links.
  * Clean up ports and ips in unit_homa_incoming.c
  * Plug into Linux capability mechanism (man(7) capabilities)
  * Don't return any errors on sends?
  * Homa-RAMCloud doesn't retransmit bytes if it transmitted other bytes
    recently; should HomaModule do the same? Otherwise, will retransmit for
    requests whose service time is just about equal to the resend timer.
-  * Check tcp_transmit_skb to make sure we are doing everything we need to
-    do with skbuffs (e.g., update sk_wmem_alloc?)
  * Add support for cgroups (e.g. to manage memory allocation)

* Questions for Linux experts:
@@ -137,120 +246,13 @@
  * If there is an error in ip_queue_xmit, does it free the packet?
    * The answer appears to be "yes", and Homa contains code to check this
      and log if not.
-  * How to compute the *real* number of CPUS (<< NR_CPUS?)
  * Is there a better way to compute packet hashes than Homa's approach
    in gro_complete?
-* Notes on IP packet transmission and reception:
-  * ip_queue_xmit -> ip_local_out -> dst_output
-    * Ultimately, output is handled by skb_dst(skb)->output(net, sk, skb),
-      which probably is ip_output
-    * ip_output -> ip_finish_output -> ip_finish_output2 -> neigh_output?
-  * Incoming packets:
-    * Interrupt handlers pass packets to netif_rx
-    * It queues them in a per-CPU softnet_data structure
-    * RPS: Receive Packet Steering
-    * On the destination core, __netif_receive_skb_core is eventually invoked?
-    * ip_rcv eventually gets called to handle all incoming IP packets
-    * ip_local_deliver_finish finally calls Homa
-
-* Notes on skbuff usage:
-  * skb->destructor: invoked when skbuff is freed.
-  * sk->sk_wmem_alloc:
-    * Keeps track of memory in write buffers that are being transmitted.
-    * Prevents final socket cleanup
-    * Has an extra increment of 1, set when socket allocated
-      and removed in sk_free (so cleanup won't be done until socket
-      has been freed)
-  * sk->sk_write_space: invoked to signal that write space has become available
-  * skb->truesize: total amount of memory required by this skbuff, including
-    both the data block and the skbuff header.
-  * sock_wmalloc: allocates new buffer for writing, limiting to sk->sk_sndbuf
-    and charging against sk->sk_wm_alloc
-  * sk->sk_sndbuf: Maximum about of write buffer space that this socket can
-    consume
-  * sk->sk_wmem_queued: "persistent queue size" (perhaps buffers that are
-    queued but not yet ready to transmit?)
-  * sk->sk_rmem_alloc: appears to count space in read buffers, but it isn't
-    invoked automatically in the current Homa call structure.
-  * skb_set_owner_r, sock_rfree: assist in managing sk_rmem_alloc
-  * nr_free_buffer_pages: appears to return info about total available
-    memory space, for autosizing buffer usage?
-  * sysctl_wmem_default: default write buffer space per socket.
-  * net.ipv4.tcp_mem[0]: if memory usage is below this, no pressure
-               [1]: start applying memory pressure at this level
-               [2]: maximum allowed memory usage
-  * net.ipv4.sysctl_tcp_wmem[0]: minimum sk_sndbuf for a socket
-               [1]: default sk_sndbuf
-               [2]: maximum allowable sk_sndbuf
-  * sk_memory_allocated_add, sk_memory_allocated_sub: keep track of memory
-    allocated for socket.
-
-* Leads still to follow for skbuff usage:
-  * Read sock_def_write_space, track variables used to wait for write space,
-    see how these are used.
-  * What's the meaning of SOCK_USE_WRITE_QUEUE in sock_wfree?
-  * Check out sock_alloc_send_pskb
-  * Check out skb_head_from_pool: allocate faster from processor-specific pool?
-  * Check out sk_forward_alloc
-  * Check out tcp_under_memory_pressure
-  * Check out sk_mem_charge
-
-* How buffer memory can accumulate in Homa:
-  * Incoming packets: messages not complete, or application doesn't read.
-  * Outgoing packets: receiver doesn't grant to us.
-
-* Possible remedies for memory congestion:
-  * Delete incoming messages that aren't active
-  * Delete incoming messages that application is ignoring
-  * Delete outgoing messages that aren't getting grants
-  * Stop receiving data from incoming messages (discard packets, send BUSY)
-  * Don't accept outbound data: stall in write, or reject
-
* Notes on timers:
  * hrtimers execute at irq level, not softirq
  * Functions to tell what level is current: in_irq(), in_softirq(),
    in_task()
-
-* Detailed switches from normal module builds:
-gcc -Wp,-MD,/home/ouster/remote/homaModule/.homa_plumbing.o.d -nostdinc -isystem /usr/lib/gcc/x86_64-linux-gnu/4.9/include -I./arch/x86/include -I./arch/x86/include/generated -I./include -I./arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/kconfig.h -D__KERNEL__ -DCONFIG_CC_STACKPROTECTOR -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -fshort-wchar -Werror-implicit-function-declaration -Wno-format-security -std=gnu89 -fno-PIE -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -m64 -falign-jumps=1 -falign-loops=1 -mno-80387 -mno-fp-ret-in-387 -mpreferred-stack-boundary=3 -mtune=generic -mno-red-zone -mcmodel=kernel -funit-at-a-time -DCONFIG_X86_X32_ABI -DCONFIG_AS_CFI=1 -DCONFIG_AS_CFI_SIGNAL_FRAME=1 -DCONFIG_AS_CFI_SECTIONS=1 -DCONFIG_AS_FXSAVEQ=1 -DCONFIG_AS_SSSE3=1 -DCONFIG_AS_CRC32=1 -DCONFIG_AS_AVX=1 -DCONFIG_AS_AVX2=1 -DCONFIG_AS_AVX512=1 -DCONFIG_AS_SHA1_NI=1 -DCONFIG_AS_SHA256_NI=1 -pipe -Wno-sign-compare -fno-asynchronous-unwind-tables -mindirect-branch=thunk-extern -mindirect-branch-register -DRETPOLINE -fno-delete-null-pointer-checks -O2 --param=allow-store-data-races=0 -DCC_HAVE_ASM_GOTO -Wframe-larger-than=2048 -fstack-protector -Wno-unused-but-set-variable -fno-var-tracking-assignments -g -pg -mfentry -DCC_USING_FENTRY -Wdeclaration-after-statement -Wno-pointer-sign -fno-strict-overflow -fno-merge-all-constants -fmerge-constants -fno-stack-check -fconserve-stack -Werror=implicit-int -Werror=strict-prototypes -Werror=date-time -DMODULE -DKBUILD_BASENAME='"homa_plumbing"' -DKBUILD_MODNAME='"homa"' -c -o /home/ouster/remote/homaModule/.tmp_homa_plumbing.o /home/ouster/remote/homaModule/homa_plumbing.c
-  ./tools/objtool/objtool orc generate --module --no-fp --retpoline "/home/ouster/remote/homaModule/.tmp_homa_plumbing.o"
-
-* TCP socket close: socket_file_ops in socket.c (.release)
-  -> sock_close -> sock_release -> proto_ops.release
-  -> inet_release (af_inet.c) -> sk->sk_prot->close
-  -> tcp_close (tcp.c)
-
-* How to pair requests and responses?
-  * Choice #1: extend addresses to include an RPC id:
-    * On client send, destination address has an id of 0; kernel fills in
-      correct id.
-    * On receive, the source address includes the RPC id (both client and server)
-    * On server send, destination address has a non-zero id (the one from
-      the receive): this is used to pair the response with a particular request.
-    Analysis:
-    * The RPC ID doesn't exactly fit as part of addresses, though it is close.
-    * Doesn't require a change in API.
-    * Can the kernel modify the address passed to sendmsg? What if the
-      application invokes write instead of sendmsg?
-  * Choice #2: perform sends and receives with an ioctl that can be used
-    to pass RPC ids.
-    Analysis:
-    * Results in what is effectively a new interface.
-  * Choice #3: put the RPC Id in the message at the beginning. The client
-    selects the id, not the kernel, but the kernel will interpret these
-    ids both on sends and receives.
-    Analysis:
-    * Awkward interaction between client and kernel, with the kernel
-      now interpreting what used to be just an uninterpreted blob of data.
-    * Will probably result in more application code to read and write
-      the ids; unclear that this can be hidden from app.
-  * Choice #4: define a new higher-level application API; it won't matter
-    what the underlying kernel calls are:
-    homa_send(fd, address, msg) -> id
-    homa_recv(fd, buffer) -> id, length, sender_address, is_request
-    homa_invoke(fd, address, request, response) -> response_length
-    homa_reply(fd, address, id, msg)
-
* Notes on managing network buffers:
  * tcp_sendmsg_locked (tcp.c) invokes sk_stream_alloc_skb, which returns
    0 if memory running short. If this happens, it invokes sk_stream_wait_memory
@@ -272,88 +274,10 @@ gcc -Wp,-MD,/home/ouster/remote/homaModule/.homa_plumbing.o.d -nostdinc -isyste
  * __sk_mem_raise_allocated is invoked from __sk_mem_schedule
  * __sk_mem_schedule is invoked from sk_wmem_schedule and sk_rmem_schedule
-* Waiting for input in TCP:
-  * tcp_recvmsg (tcp.c) -> sk_wait_data (sock.c)
-    * Waits for a packet to arrive in sk->sk_receive_queue (loops)
-  * tcp_v4_rcv (tcp_ipv4.c) -> tcp_v4_do_rcv
-    -> tcp_rcv_established (tcp_input.c) -> sk->sk_data_ready
-    -> sock_def_readable (sock.c)
-    * Wakes up sk->sk_wq
-
-* Waiting for input in UDP:
-  * udp_recvmsg -> __skb_recv_udp -> __skb_wait_for_more_packets (datagram.c)
-    * Sleeps process with no loop
-  * udp_rcv -> __udp4_lib_rcv -> udp_queue_rcv_skb -> __udp_queue_rcv_skb
-    -> __udp_enqueue_schedule_skb -> sk->sk_data_ready
-    -> sock_def_readable (sock.c)
-    * Wakes up sk->sk_wq
-
-* Notes on waiting:
-  * sk_data_ready function looks like it will do most of the work for waking
-    up a sleeping process. sock_def_readable is the default implementation.
-
-* On send:
-  * Immediately copy message into sk_buffs.
-  * Client assigns message id; it's the first 8 bytes of the message data.
-  * Return before sending entire message.
-  * Homa keeps track of outstanding requests (some limit per socket?).
-  * If message fails, kernel must fabricate a response. Perhaps all
-    responses start with an id and a status?
-
-* Tables needed:
-  * All Homa sockets
-    * Used to assign new port numbers
-    * Used to dispatch incoming packets
-    * Need RCU or some other kind of locking?
-  * Outgoing RPCs (for a socket?)
-    * Used to find state for incoming packets
-    * Used for cleanup operations (socket closure, cancellation, etc.)
-    * Used for detecting timeouts
-    * No locks needed: use existing socket lock
-    * Or, have one table for all sockets?
-  * Outgoing requests that haven't yet been transmitted:
-    * For scheduling outbound traffic
-    * Must be global?
-  * Outgoing responses that haven't yet been transmitted:
-    * For scheduling outbound traffic
-    * Must be global?
-  * Incoming RPCs:
-    * Use to find state for incoming packets
-
* Miscellaneous information:
  * For raw sockets: "man 7 raw"
  * Per-cpu data structures: linux/percpu.h, percpu-defs.h
-* API for applications
-  * Ideally, sends are asynchronous:
-    * The send returns before the message has been sent
-    * Data has been copied out of application-level buffers, so
-      buffers can be reused
-  * Must associate requests and responses:
-    * A response is different from a request.
-    * Kernel may need to keep track of open requests, so that it
-      can handle RESEND packets appropriately; what if application
-      doesn't respond, and an infinite backlog of open requests
-      builds up? Must limit the kernel state that accumulates.
-    * Maybe application must be involved in RESENDs?
-  * On receive, application must provide space for largest possible message
-    * Or, receives must take 2 system calls, one to get the size and
-      one to get the message.
-  * Support a polling API for incoming messages?
-    * Client provides buffer space in advance
-    * Kernel fills in data as packets arrive
-    * Client can poll memory to see when new messages arrive
-    * This would minimize sk_buff usage in the kernel
-    * Is there a way for the kernel to access client memory when
-      the process isn't active?
-    * Can buffer space get fragmented? For example, the first part of
-      a long message arrives, but the rest doesn't; meanwhile, buffers
-      fill up and wrap around.
-  * On receive, avoid copies of large message bodies? E.g., deliver only
-    header to the application, then it can come back later and request
-    that the body be copied to a particular spot.
-  * Provide a batching mechanism to avoid a kernel call for each message?
-
* What happens when a socket is closed?
  * socket.c:sock_close
  * socket.c:sock_release
@@ -364,19 +288,6 @@ gcc -Wp,-MD,/home/ouster/remote/homaModule/.homa_plumbing.o.d -nostdinc -isyste
  * sock_orphan
  * sock_put (decrements ref count, frees)
-* What happens in a connect syscall (UDP)?
-  * socket.c:sys_connect
-  * proto_ops.connect -> af_inet.c:inet_dgram_connect
-  * proto.connect -> datagram.c:ip4_datagram_connect
-    * datagram.c: __ip4_datagram_connect
-
-* What happens in a bind syscall (UDP)?
-  * socket.c:sys_bind
-  * proto_ops.bind -> afinet.c:inet_bind
-  * proto.bind -> (not defined for UDP)
-  * If no proto.bind handler, then a bunch of obscure -looking stuff
-    happens.
-
* What happens in a sendmsg syscall (UDP)?
  * socket.c:sys_sendmsg
  * socket.c:__sys_sendmsg
diff --git a/perf.txt b/perf.txt
index f26ab1cd..68e2f00c 100644
--- a/perf.txt
+++ b/perf.txt
@@ -2,6 +2,574 @@
This file contains various notes and lessons learned concerning performance
of the Homa Linux kernel module. The notes are in reverse chronological
order.
+68. (January 2025) Performance snapshot with and without pacer, using
+c6620 CloudLab nodes, "-w w4 -b 80 -s 20 -n 6". cp_vs_tcp is used unless
+cp_both is indicated.
+
+AvgSlow: "avg slowdown" from cp_vs_tcp log output
+Min:     "min" from cp_vs_tcp "avg slowdown" line
+P50:     "P50" from cp_vs_tcp "avg slowdown" line
+P99:     "P99" from cp_vs_tcp "avg slowdown" line
+P99L:    P99 for 1 MB messages, from *_w4.data file
+MaxT:    Throughput under "-b100"
+
+                            AvgSlow   Min    P50     P99   P99L  MaxT
+Homa (old pacer)               3.33  22.1   50.9    98.5   3284  96.6
+homa (homa_qdisc)              3.31  21.1   50.6    90.7   3698  94.6
+Homa (cp_both, old_pacer)      4.56  23.6   56.4   379.6   4182
+homa (cp_both, homa_qdisc)     3.72  23.5   53.6   124.6   4021
+TCP (no homa_qdisc)           11.81  32.9  180.4  1271.6   5235  94.8
+TCP (homa_qdisc)              10.80  32.6  157.1   832.6   4627  95.7
+TCP (cp_both, old pacer)       9.22  31.4  151.6   839.2   3062
+TCP (cp_both, homa_qdisc)      9.13  32.9  136.5   762.5   4127
+
+Summary:
+* Without homa_qdisc, Homa P99 suffers a lot under cp_both; with homa_qdisc
+  it improves 3x, but is still 30% slower than running without TCP.
+* homa_qdisc improves TCP performance even when running without Homa.
+* TCP performance is better running with Homa than standalone.
+* Homa_qdisc reduces Homa's maximum throughput slightly, increases TCP's
+  maximum throughput slightly.
+
+67. (January 2025) Performance variation over reboots. On the c6620 CloudLab
+cluster, both Homa and TCP performance seems to vary from reboot to reboot.
+Within a single boot, performance is relatively consistent across runs.
+However, after observing this phenomenon one day, I found it had completely
+disappeared the next day
+(reboots consistently resulted in "fast" behavior). There was a CloudLab
+datacenter
+shutdown overnight... perhaps that somehow changed the behavior?
+
+Each line below represents one reboot of a c6620 cluster running this command:
+
+cp_vs_tcp -w w4 -b 80 -s 20 -l /ouster/logs/test -n 6 --skip 0 --tcp yes
+--port-threads 3 --port-receivers 3 --client-ports 5 --server-ports 2
+--tcp-client-ports 10 --tcp-server-ports 20
+
+The command was run once with the old pacer and once with homa_qdisc
+enabled. In addition, for the "Both" measurements, cp_both was used
+to run Homa and TCP simultaneously (with homa_qdisc enabled):
+
+cp_both -w w4 -b 80 -s 20 -l /ouster/logs/test -n 6 --skip 0 --homa-gbps 40
+--port-threads 3 --port-receivers 3 --client-ports 5 --server-ports 2
+--tcp-client-ports 10 --tcp-server-ports 20
+
+Each measurement includes average slowdown and P99 short-message latency,
+as printed by cp_vs_tcp on the "avg slowdown" line.
+
+Homa no qdisc  TCP no qdisc  Homa Qdisc  Tcp Qdisc   Homa Both   TCP Both
+ Avg   P99      Avg   P99     Avg  P99    Avg  P99    Avg  P99    Avg  P99
+----------------------------------------------------------------------------
+3.30    97    11.68  1258    3.30   90  10.91  850   3.67  125   9.09  766
+3.30    97    11.73  1277    3.29   90  10.72  816   3.69  125   9.17  768
+3.30    97    11.62  1273    3.31   91  10.77  837   3.68  125   9.15  784
+3.30    98    11.71  1268    3.30   90  10.75  828   3.68  124   9.15  769
+3.31    99    11.64  1268    3.60   97  11.54  906   4.38  144  11.60  892
+3.34   101    11.71  1253    3.35   94  10.92  860   3.71  125   9.19  774
+3.85   135    12.40  1501    4.08  117  12.04  961   4.77  158  13.19 1003
+3.94   143    12.53  1555    3.92  107  11.87  961   5.12  204  14.28 1126
+4.20   255    12.86  1694    4.20  133  12.71 1053
+
+The following experiments were run repeatedly without rebooting the nodes
+(two different reboots separated by a blank line):
+
+Homa no qdisc  TCP no qdisc  Homa Qdisc  Tcp Qdisc   Homa Both   TCP Both
+ Avg   P99      Avg   P99     Avg  P99    Avg  P99    Avg  P99    Avg  P99
+----------------------------------------------------------------------------
+3.30    97    11.73  1277    3.29   90  10.72  816   3.69  125   9.17  768
+3.30    97    11.74  1270    3.31   90  10.72  823   3.67  125   9.05  764
+3.30    97    11.68  1259    3.30   90  10.92  852   3.67  124   9.11  762
+3.30    97    11.68  1267    3.30   90  10.79  831   3.69  125   9.21  766
+3.29    97    11.74  1291    3.30   90  10.79  837   3.68  125   9.12  767
+3.29    97    11.75  1276    3.30   90  10.68  814   3.68  125   9.16  773
+
+3.97   138    12.72  1599    4.06  110  12.33 1002   4.91  173  13.78 1083
+4.07   146    12.74  1632    4.03  112  12.04  966   4.72  162  13.49 1020
+4.09   150    12.63  1577    4.06  114  12.43 1013   5.05  181  14.41 1145
+4.08   148    12.59  1557    4.11  114  12.33 1000   5.02  177  14.39 1093
+4.17   180    12.65  1603    3.94  106  12.20  988   4.95  178  14.30 1130
+3.99   132    12.74  1629    4.05  111  12.25  989   5.01  172  14.01 1085
+4.02   143    12.53  1558    4.09  113  11.91  959   5.10  191  15.18 1208
+3.89   126    12.54  1590    4.28  120  12.08  981   5.14  182  14.07 1111
+
+66. (January 2025) Evaluated benchmarking parameters for 100 Gbps networks
+(c6620 CloudLab cluster). Overall, for W4 the best parameters for Homa are:
+
+--port-threads 3 --port-receivers 3 --client-ports 5 --server-ports 2
+('--client-ports 4 --server-ports 2' and '--client-ports 3 --server-ports 3'
+are about the same)
+
+and for TCP:
+
+--tcp-client-ports 10 --tcp-server-ports 20
+
+Here are more detailed measurements:
+Thr:     --port-threads and --port-receivers
+CPorts:  --client-ports
+SPorts:  --server-ports
+TcpCP:   --tcp-client-ports
+TcpSP:   --tcp-server-ports
+HomaS:   Average slowdown for Homa
+HomaP99: P99 latency for short messages for Homa (usecs)
+TcpS:    Average slowdown for TCP
+TcpP99:  P99 latency for short messages for TCP (usecs)
+
+Homa under cp_vs_tcp with homa_qdisc (c6620, 6 nodes):
+Note: these measurements were taken with a "good" boot configuration
+-w   -b   Thr  CPorts  SPorts  HomaS  HomaP99
+-------------------------------------------
+w3   34    2     3       3      1.98    142
+w3   34    2     4       4      1.77     98
+w3   34    2     5       5      1.67     76
+w3   34    2     6       6      1.68     72
+w3   34    2     7       7      1.69     69   max tput (47.4 Gbps)
+w3   34    2     8       8      1.70     69
+
+w4   80    3     5       1      3.44    170
+w4   80    3     4       2      3.36     94
+w4   80    3     5       2      3.35     94
+w4   80    3     3       3      3.41     96
+w4   80    3     4       4      3.51     99
+w4   80    3     5       5      3.53    100
+w4   80    3     3       5      3.58    104
+w4   80    3     5       3      3.43     94
+w4   80    2     4       4      3.43    109
+w4   80    2     5       5      3.43    103
+w4   80    2     6       6      3.47    102
+w4   80    2     7       7      3.51    104
+
+w5   80    2     6       4      8.04    177
+w5   80    3     4       2      7.74    136
+w5   80    3     3       3      8.25    141
+w5   80    3     4       4      8.42    141
+
+TCP under cp_vs_tcp with homa_qdisc (c6620, 6 nodes):
+Note: these measurements were taken with a "good" boot configuration
+-w   -b   TcpCP  TcpSP   TcpS  TcpP99
+------------------------------------
+w3   34     4      8     3.10    445
+w3   34     5     10     3.72    516
+w3   34     6     12     3.50    430
+w3   34     7     14     3.53    390
+w3   34     8     16     3.63    368   max tput (42.7 Gbps)
+w3   34     9     18     3.80    361
+
+w4   80     2      4    25.31   4040
+w4   80     3      6    13.83   1790
+w4   80     4      8    12.42   1536
+w4   80     5     10    12.23   1461
+w4   80     6     12    12.28   1342
+w4   80     7     14    11.68   1105
+w4   80     8     16    11.40    980
+w4   80     9     18    10.87    872
+w4   80    10     20    10.79    843
+w4   80    12     24    11.41    821
+w4   80    15     30    15.74    915
+
+w5   80     6     12    16.00   1927
+w5   80     8     16    15.80   1866
+w5   80    10     20    15.27   1636
+w5   80    12     24    15.38   1478
+
+Explored configurations for cp_both (c6620 cluster, -w w4 -b 80):
+Note: these measurements were taken with a "good" boot configuration
+(HGbps is the --homa-gbps parameter)
+
+HGbps  Thr  CPorts  SPorts  TcpCP  TcpSP  HomaS  HomaP99  TcpS  TcpP99
+----------------------------------------------------------------------
+5       3     1       1       8     16     5.02     268  10.83    929
+5       3     2       2      10     20     4.54     205  10.50    842
+5       3     3       3      12     24     4.45     179  11.07    818
+5       3     4       2      10     15     4.23     173  10.23    892
+5       3     5       2      12     12     3.95     136  10.18    932
+5       3     6       2       8     24     4.36     181  10.30    830
+5       2     4       2      16     16     4.02     148  10.52    856
+5       2     5       2      16     20     4.32     182  11.40    844
+
+20      3     4       2      10     20     3.93     154  10.14    828
+20      3     5       2      10     20     3.86     146  10.06    815
+20      3     5       3      10     20     3.89     145  10.13    824
+
+40      3     5       2      10     20     3.71     125   9.19    774
+40      3     5       3      10     20     3.71     124   9.21    763
+40      3     5       3       8     16     3.72     126   8.99    794
+
+60      3     4       2      10     20     3.48     106   7.60    643
+60      3     5       2      10     20     3.47     104   7.62    647
+60      3     5       3      10     20     3.48     103   7.60    635
+
+75      3     4       2      10     20     3.27      94   7.38    560
+75      3     5       2      10     20     3.24      93   7.36    553
+75      3     5       3       3      6     3.42      93   7.04    602
+75      3     5       3       8     16     3.31      91   7.21    565
+
+65. (December 2025) The pacer does not prevent NIC queue buildup. Under
+"-w w4 -b 80" on c6620 machines (Intel NICs) it is not unusual to see
+periods of 1ms or longer with more than 500 Kbytes of packet data queued
+in the NIC. This happens because the NIC cannot always sustain 100 Gbps of
+output. Even with large amounts of queued data, the NIC completion rate varies
+between 85 and 100 Gbps. Since the pacer will queue data at almost 100 Gbps,
+the NIC queue builds when there is a large backlog of data; over time, the
+queue for data tends to move from the pacer to the NIC. The pacer limit rate
+would have to be reduced considerably to eliminate this problem (e.g., 85 Gbps
+instead of 99 Gbps?) but that would waste a lot of NIC bandwidth since there
+are many times when the NIC can transmit at nearly 100 Gbps.
+
+64. (November 2025) Separating pacer traffic from non-paced traffic in
+homa_qdisc (use tx queue 0 for paced traffic; non-paced traffic is spread
+across other queues, using default queues except that traffic for queue 0
+goes to queue 1 instead). In comparison to the old pacer (measurements
+with w4 and w5 on c6620 cluster at 80 Gbps load; see log book for graphs):
+* P99 for messages shorter than defer_min_bytes is 20-30% faster with separation
+* P99 for messages between defer_min_bytes and unsched_limit is about 2x
+  slower with separation
+* P99 for messages longer than unsched_limit starts off 40-50% slower with
+  separation, but gradually converges.
+* Increasing defer_min_bytes provides upside with no apparent downside.
+* Average slowdowns are better with the old pacer: 3.45 vs. 3.77 for W4,
+  9.40 vs. 7.72 for W5 (W5 has no messages shorter than defer_min_bytes).
+* It appears that Intel NICs cannot always transmit at full link bandwidth,
+  so some queuing occurs in the NIC even with Homa's output
+  pacing.
+* When packets build up in the NIC, it appears to use some sort of fair
+  sharing mechanism between the queues. By placing a disproportionate
+  share of outgoing bytes in a single queue, those bytes effectively get
+  lower priority and bytes in other queues get higher priority, which
+  explains the behaviors observed above.
+* Overall, it appears that placing pacer traffic in a dedicated queue is
+  not a good idea.
+
+63. (September 2025) Compared CPU utilization against TCP. Measured with
+top, running cp_vs_tcp -w w4 -b20 on a 6-node xl170 cluster (20 cores):
+
+                           Homa   TCP   Homa no polling
+us (user)                   9.8  15.7   11.0
+sy (system)                31.5  11.8   17.3
+ni (nice)                   0.0   0.0    0.0
+id (idle)                  38.0  49.2   51.9
+wa (iowait)                 0.0   0.0    0.0
+hi (hardware interrupts)    0.0   0.0    0.0
+si (software interrupts)   19.3  22.2   19.5
+st (hypervisor steal)       0.0   0.0    0.0
+
+Without polling, Homa's CPU utilization is slightly lower than TCP's.
+Polling costs an extra 2-3 cores for Homa.
+
+62. (August 2025) Using ktime_get_ns (rdtscp) instead of get_cycles (rdtsc)
+in homa_clock (Linux reviewers won't allow get_cycles for upstreaming).
+rdtscp takes about 14 ns per call, vs. 8 ns for rdtsc. Running "w4 -b20"
+on xl170s, homa_clock invocations are 21 M/sec, so expect about 0.12 additional
+core to be used. Measurements on xl170 cluster (25 Gbps network) using "w4 -b20"
+(average across 6 nodes in experiment, then average over 5 runs):
+                           rdtsc   rdtscp   Ratio
+Gbps/sec/core:              6.46     6.22   0.954
+Total core utilization:     6.20     6.44   1.038
+
+Same experiment but in overload ("w4 -b40"):
+                           rdtsc   rdtscp   Ratio
+Gbps/sec/core:              5.44     5.32   0.980
+Total core utilization:     8.08     8.05   0.997
+Maximum throughput (Gbps): 21.95    21.42   0.976
+
+61. (July 2025) Client responses could starve server requests. This came
+about because a server request that wakes up after waiting for buffer space
+has 0 received bytes. In contrast, a new client response will have received
+unscheduled bytes. As a result, the client responses always got priority for
+new grants and server requests could starve. The solution was to grant server
+requests an amount equal to the unscheduled bytes when they wake up after
+waiting for buffer space.
+
+60. (July 2025) Measured impact of new FIFO grant mechanism on xl170
+cluster using "-w starve -b 40 -s 30 -n 6" (priorities were not enabled).
+Slowdowns as a function of message length:
+
+          grant_fifo_fraction = 0    grant_fifo_fraction = 50   grant_fifo_fraction = 100
+# length    s50      s99     s999      s50     s99     s999       s50     s99     s999
+  100000   13.7     25.5     86.8     13.3    21.7     31.9      13.2    22.2     32.7
+  200000   13.0     32.2     75.7     12.7    21.2     29.0      12.6    21.5     30.7
+  300000   13.4     30.2     64.5     13.1    22.1     28.2      13.0    22.7     30.2
+  400000   14.3     30.9     60.1     14.0    24.5     30.6      14.1    25.9     33.3
+  500000   16.1     35.0     83.0     15.9    30.5     37.4      16.4    32.8     41.6
+  600000   19.0     49.3    185.7     19.5    41.2     53.1      20.8    47.7     62.2
+  700000   24.1     70.1    222.0     26.7    67.8     91.4      30.8    88.6    122.2
+  800000   34.8    121.2    282.6     47.5   178.9    268.4      67.9   315.6    470.3
+  900000   72.6    307.5    470.5   1155.3  2139.8   2314.1    1477.2  1746.0   1823.7
+ 1000000 3093.4  12063.2  13050.8   1982.2  2354.0   2482.9    1467.0  1647.1   1709.4
+
+Even shorter messages seem to benefit from the FIFO mechanism (not sure why...).
+Increasing the FIFO fraction from 5% to 10% doesn't make much difference and
+starts to penalize smaller messages more.
+
+FIFO also helps even when the cluster isn't overloaded: slowdown at
+"-w starve -b 20 -s 30 -n 6":
+
+          grant_fifo_fraction = 0    grant_fifo_fraction = 50
+# length    s50      s99     s999      s50     s99     s999
+  100000   11.0     20.0     27.4     11.1    20.0     28.1
+  200000   10.5     19.5     26.0     10.5    19.6     26.1
+  300000   10.6     20.4     25.7     10.7    20.8     25.8
+  400000   11.1     22.0     26.8     11.2    22.7     28.5
+  500000   11.8     24.6     31.3     12.1    26.9     35.4
+  600000   13.0     30.3     39.2     13.4    33.5     45.9
+  700000   14.6     39.4     53.5     15.2    46.9     67.8
+  800000   16.9     55.3     82.7     17.6    63.8     92.7
+  900000   20.2     93.6    147.9     20.8    80.5    112.2
+ 1000000   23.4    155.6    250.4     23.3    93.3    128.7
+
+When the cluster isn't overloaded, short messages get a bit worse when FIFO
+is enabled.
+
+59. (May 2025) Measured overhead to read various clocks on 2.4 GHz
+Xeon E5-2640 (note: measured when CPU is active, hence running in fastest
+mode):
+
+Function                 Units    Overhead
+-----------------------------------------------
+rdtsc                    cycles   8 ns
+rdtscp                   cycles   14 ns
+sched_clock              ns       9 ns
+ktime_get_mono_fast_ns   ns       24 ns
+ktime_get_raw_fast_ns    ns       24 ns
+
+58. (September 2024): Interference between Homa and TCP when both run
+    concurrently on the same nodes (no special kernel code to mitigate
+    interference)
+    Experiment on xl170 cluster:
+    cp_both -n 9 --skip 0 -w w4 -b 20 -s 30
+
+    HomaGbps: Gbps generated by Homa (20 - HomaGbps generated by TCP)
+    HAvg:     Average slowdown for Homa
+    HP50:     Median RTT for Homa short messages
+    HP99:     P99 RTT for Homa short messages
+    TAvg:     Average slowdown for TCP
+    TP50:     Median RTT for TCP short messages
+    TP99:     P99 RTT for TCP short messages
+
+    HomaGbps   HAvg   HP50   HP99    TAvg   TP50    TP99
+       0                             63.4    797    6089
+       2        8.1     66    335    80.5   1012   10131
+       4        8.6     65    507    80.0   1021    9315
+       6        9.9     66    765    80.8   1022    9328
+       8       12.1     68   1065    79.8   1042    8309
+      10       14.3     70   1324    76.7    993    6881
+      12       15.1     72   1394    73.4    971    5866
+      14       14.8     75   1305    73.1    927    6076
+      16       12.9     75   1077    70.2    816    6564
+      18       10.0     70    755    69.7    748    7387
+      20        4.4     44    119
+
+    Overall observations:
+    * Short messages:
+      * Homa: 2x increase for P50, 10x increase for P99
+      * TCP: 25% increase for P50, 10% increase for P99
+      * The TCP degradation is caused by Homa using priorities. If the
+        experiment is run without priorities for Homa, TCP's short-message
+        latencies are significantly better than with TCP by itself: 571 us
+        for P50, 3835 us for P99.
+    * Long messages:
+      * TCP P50 and P99 latency drop by up to 40% as Homa traffic share
+        increases (perhaps because Homa throttles itself to link speed?)
+      * Running Homa without priorities improves TCP even more (2x gain for TCP
+        P50 and P99 under even traffic split, relative to TCP alone)
+      * Homa latency not much affected
+    * Other workloads:
+      * W5 similar to W4
+      * W3 and W2 show less Homa degradation, more TCP degradation
+    * Estimated NIC queue lengths have gotten much longer (e.g. P99 queueing
+      delay of 235-750 us now, vs. < 10 us when Homa runs alone)
+      * Homa packets are experiencing even longer delays than this because
+        packets aren't distributed evenly across tx queues, while the NIC serves
+        queues evenly.
+
+57. (August 2024): Best known parameters for c6525-100g cluster:
+    Homa:
+      hijack_tcp=1 unsched_bytes=20000 window=0 max_incoming=1000000
+      gro_policy=0xe2 throttle_min_bytes=1000
+      --client-ports 4 --port-receivers 6 --server-ports 4 --port-threads 6
+    TCP:
+      --tcp-client-ports 4 --tcp-server-ports 6
+
+56. (August 2024): Performance challenges with c6525-100g cluster (AMD CPUs,
+    100 Gbps links):
+    * The highest achievable throughput for Homa with W4 is 72-75 Gbps.
+    * TCP can get 78-79 Gbps with W4.
+    * The bottleneck is NIC packet transmission: 1 MB or more of data can
+      accumulate in NIC queues, and data can be queued in the NIC for 1 ms
+      or more.
+    * Memory bandwidth appears to be the limiting factor (not, say,
+      per-packet overheads for mapping addresses). For example, W2 can
+      transmit more packets than W4 without any problem.
+    * NIC queue buildup is not even across output queues. The queue used by
+      the pacer has significantly more buildup than the other queues. This
+      suggests that the NIC services queues in round-robin order. The pacer
+      queue gets a large fraction of all outbound traffic but it receives
+      only a 1/Nth share of the NIC's output bandwidth, so when the NIC can't
+      keep up, packets accumulate primarily in this one queue.
+    * Priorities don't make a significant difference in latency! It appears
+      that the NIC queuing issue is the primary contribution to P99 latency
+      even for short messages (too short to use the pacer). This is evident
+      because not only do P99 packets take a long time to reach the receiver's
+      GRO, they also take a long time to get returned to the sender to be
+      freed; this suggests that they are waiting a long time to get
+      transmitted. Perhaps the P99 packets are using the same output queue
+      as the pacer?
+    * Even at relatively low throughputs (e.g. 40 Gbps), P99 latency still
+      seems to be caused by slow NIC transmission, not incast queueing.
+    * Increasing throttle_min_bytes improves latency significantly, because
+      packets transmitted by the pacer are much more likely to experience
+      high NIC delays.
+
+55. (June/July 2024): Reworked retry mechanism to retry more aggressively.
+    Introduced ooo_window_usecs sysctl parameter with an initial value of
+    100 us; retry gaps once they reach this age. However, this increased the
+    number of resent packets by 20x and reduced throughput as well.
+    Hypothesis: many packets suffer quite long delays but eventually get
+    through; with fast retries, these get resent unnecessarily. Tried
+    increasing the value of ooo_window_usecs, and this helped a bit, but
+    performance is best if retries only happen when homa_timer hits its
+    resend_ticks value. So, backed out support for ooo_window_usecs.
+
+54. (June 2024): New sk_buff allocation mechanism. Up until now, Homa
+    allocated an entire tx sk_buff with alloc_skb: both the packet header
+    and the packet data were allocated in the head. However, this resulted
+    in high overheads for sk_buff allocation. Introduced a new mechanism
+    (in homa_skb.c) for tx sk_buffs, where only the packet header is in the
+    head. The data for data packets is allocated using frags and high-order
+    pages (currently 64 KB). In addition, when sk_buffs are freed, Homa
+    saves the pages in pools (one per NUMA node) to eliminate the overhead
+    of page allocation. Here are before/after measurements taken with the
+    W4 workload on a 9-node c6525-100g cluster:
+                                      Before        After
+    Avg. time to allocate sk_buff     7-9 us        0.85 us
+    Cores spent in sk_buff alloc      3.6-4.5       0.4-0.5
+    Cores spent in kfree_skb          1.1-1.3       0.3-0.4
+    Goodput/core                      5.9-7.2 Gbps  8.4-10 Gbps
+    Time to allocate page                           12 us
+    Cores spent allocating pages                    0.04-0.08
+
+53. (May 2024; superseded by #56) Strange NIC behavior (observed with Mellanox
+    ConnectX5 NICs on the c6525-100g CloudLab cluster, using W4 with offered
+    load 80 Gbps and actual throughput more like 60 Gbps).
+    * The NIC is not returning tx packets to the host promptly after
+      transmission. In one set of traces (W4 at 80% offered load), 20% of
+      all packets weren't freed until at least 50 us after the packets had
+      been received by the target GRO; P99 delay was 400 us, and some packets
+      were delayed more than 1 ms. Note: other traces are not as bad, but
+      still show significant delays (15-20% of delays are at least 50 usec,
+      worst delays range from 250 us - 1100 us).
+    * Long delays in returning tx packets cause Linux to stop the tx queue
+      (it has a limit on outstanding bytes on a given channel), which slows
+      down transmission.
+    * The NIC doesn't seem to be able to transmit packets at 100 Gbps.
+      Many packets seem not to be transmitted for long periods of time (up to
+      1-2 ms) after they are added to a NIC queue: both the time until GRO
+      receipt and time until packet free are very long. Different tx queues
+      experience different delays: the delays for one queue can be short at
+      the same time that delays for another queue are very long. These problems
+      occur when Homa is passing packets to the NIC at < 100 Gbps.
+    * The NIC is not transmitting packets from different tx queues in a FIFO
+      order; it seems to be favoring some tx queues (perhaps it is
+      round-robining so queues with more traffic get treated badly?).
+
+52. (February 2024) Impact of core allocation. Benchmark setup: 2 nodes,
+    c6525-100g cluster (100 Gbps network, 48 hyperthreads, 24 cores, 3 cores
+    per chiplet?):
+    cp_node server --pin N
+    cp_node client --workload 500000 --one-way --client-max 1
+    window=0 max_incoming=2500000 gro_policy=16 unsched_bytes=50000
+    Measured RPC throughput and copy_to_user throughput:
+
+    --pin   Gbps   Copy
+      0     17.7   33.4
+      3     18.9   32.2
+      6     19.0   34.3
+      8     18.8   34.1
+      9     22.2   54.2
+     10     25.7   53.2
+     11     26.3   55.1
+     12     17.9   31.7
+     13     18.2   31.6
+     15     17.9   31.5
+     18     18.2   32.3
+     21     18.1   32.4
+     32     18.6   34.0
+     33     24.8   54.0
+     34     25.9   54.5
+     35     26.3   54.5
+     36     17.7   31.5
+
+51. (February 2024) RPC lock preemption. When SoftIRQ is processing a large
+    batch of packets for a single RPC, it was holding the RPC lock continuously.
+
+    This prevented homa_copy_to_user from acquiring the lock to extract the
+    next batch of packets to copy. Since homa_copy_to_user is the bottleneck
+    for large messages on 100 Gbps networks, this can potentially affect
+    throughput. Fixed by introducing APP_NEEDS_LOCK for RPCs, so that
+    SoftIRQ releases the lock temporarily if homa_copy_to_user needs it.
+    This may have improved throughput for W4 on c6525-100g cluster by 10%,
+    but it's very difficult to measure accurately.
+
+50. (February 2024) Don't queue IPIs. Discovered that when homa_gro_receive
+    invokes netif_receive_skb (intending to push a batch of packets through
+    to SoftIRQ ASAP), Linux doesn't immediately send an interprocessor
+    interrupt (IPI). It just queues the pending IPI until all NAPI processing
+    is finished, then issues all of the queued IPIs. This results in
+    significant delay for the first batch when NAPI has lots of additional
+    packets to process. Fixed this by writing homa_send_ipis and invoking it
+    in homa_gro_receive after calling netif_receive_skb. In 2-node tests
+    with "cp_node client --workload 500000 --client-max 1 --one-way"
+    (c6525-100g cluster), this improved latency from RPC start to beginning
+    copy to user space from 79 us to 46 us, resulting in 10-20% improvement
+    in throughput. W4 throughput appears to have improved about 10% (but a bit
+    hard to measure precisely).
+
+49. (November 2023) Implemented "Gen3" load balancing scheme, renamed the
+    old scheme "Gen2". For details on load balancing, see balance.txt.
+    Gen3 seems to significantly reduce tail latency for cross-core handoffs;
+    here are a few samples (us):
+                    --Gen2 P50-   ---Gen2 P99---   --Gen3 P50-   --Gen3 P99-
+    GRO -> SoftIRQ  2.7 2.8 3.0   71.1 43.6 71.3   2.8 2.6 2.7   8.7 5.4 8.3
+    SoftIRQ -> App  0.3 0.3 0.3   20.5 21.7 19.9   0.3 0.3 0.3   7.2 6.8 9.0
+
+    However, this doesn't seem to translate into better overall performance:
+    standard slowdown graphs look about the same with Gen2 and Gen3 (Gen2 has
+    better P99 latency for W2 and W3; Gen3 is better for W5). This needs more
+    analysis.
+
+48. (August 2023) Unexpected packet loss on c6525-100g cluster (AMD processors,
+    100 Gbps links). Under some conditions (such as "cp_node client --one-way
+    --workload 1000000" with dynamic_windows=1 and unsched_bytes=50000)
+    messages suffer packet losses starting around offset 700000 and
+    continuing intermittently until the end of the message. I was unable
+    to identify a cause, but increasing the size of the Mellanox driver's
+    page cache (MLX5E_CACHE_SIZE, see item 46 below) seems to make the problem
+    go away. Slight configuration changes, such as unsched_bytes=200000, also
+    make the problem go away.
+
+47. (July 2023) Intel vs. AMD processors. 100B roundtrips under best-case
+    conditions are about 8.7 us slower on AMD processors than Intel:
+    xl170:        14.5 us
+    c6525-100g:   23.2 us
+    Places where c6525-100g is slower (each way):
+    Packet prep (Homa):      1.2 us
+    IP stack and driver:     0.9 us
+    Network (interrupts?):   1.7 us
+    Thread wakeup:           0.6 us
+    TCP is also slower on AMD: 38.7 us vs. 23.3 us
+    Note: results on AMD are particularly sensitive to core placement of
+    various components.
+
+46. (July 2023) MLX buffer issues on c6525-100g cluster. The Mellanox
+    driver is configured with 256 pages (1 MB) of receive buffer space
+    for each channel. With a 100 Gbps network, this is about 80 us of
+    time. However, a single thread can copy data from buffers to user space
+    at only about 40 Gbps, which means that with longer messages, the
+    copy gets behind and packet lifetimes increase: with 1 MB messages,
+    median lifetime is 77 us and P90 lifetime (i.e. the later packets in
+    messages) are 115 us. With multiple messages from one host to another,
+    the buffer cache is running dry. When this happens, the Mellanox driver
+    allocates (and eventually frees) additional buffers, which adds
+    significant overhead. Bottom line: it's essential to use multiple
+    channels to keep up with a 100 Gbps network (this provides a larger
+    total buffer pool, plus more threads to copy to user space).

45. (January 2023) Up until now, output messages had to be completely copied
    into sk_buffs before transmission could begin. Modified Homa to pipeline
    the copy from user space with packet transmission. This makes a significant
@@ -85,7 +653,7 @@ order.
    messages at low load

41. (December 2022) More analysis of SMI interrupts. Wrote smi.cc to gather
-   data on vents that cause all cores to stop simultaneously. Found 3 distinct
+   data on events that cause all cores to stop simultaneously. Found 3 distinct
    kinds of gaps on xl170 (Intel) CPUs:
    * 2.5 usec gaps every 4 ms
    * 17 usec gaps every 10 ms (however, these don't seem to be consistent:
@@ -95,7 +663,7 @@ gaps every 4 ms
    be scheduler wakeups?)

40. (December 2022) NAPI can't process incoming jumbo frames at line rate
-   for 100 Gbps network (AMD CPUs): it takes at 850 ns to process each
+   for 100 Gbps network (AMD CPUs): it takes about 850 ns to process each
    packet (median), but packets are arriving every 700 ns. Most of the time
    is spent in __alloc_skb in two places:
    kmalloc_reserve for data: 370 ns
@@ -120,7 +688,7 @@

38. (December 2022) Restructured the receive buffer mechanism to mitigate
    the page_pool_alloc_pages_slow problem (see August 2022 below); packets
-   can now be copied to user space and their buffers release without waiting
+   can now be copied to user space and their buffers released without waiting
    for the entire message to be received. This has a significant impact on
    throughput. For "cp_node --one-way --client-max 4 --ports 1
    --server-ports 1 --port-threads 8" on the c6525-100g cluster:
@@ -307,8 +875,8 @@ TCP P50        108.8 (3.5x)   192.7 (4.6x)   353.1 (7.5x)   385.7 (6.9x)
    largest messages with modified W4 workload.
    gro_policy:     NORMAL always better; others 10-25% worse for short P99
    max_gro_skbs:   Larger is better; reducing to 5 hurts short P99 10-15%.
-                   However, anecdotal experience suggests that very large
-                   values can cause long delays for things like sending
+                   However, anecdotal experience suggests that very large
+                   values can cause long delays for things like sending
                    grants, so perhaps 10 is best?
    max_gso_size:   10K looks best; not much difference above that, 10-20%
                    degradation of short P99 at 5K
@@ -323,8 +891,10 @@
                    lower short P99 for W2 and W3 than 4 receivers, but for
                    W5 3 receivers is 10% better than 2. Best choice: 3p2r?
    rtt_bytes:      60K is best, but not much sensitivity: 40K is < 10% worse
-   throttle_bytes: Almost no noticeable difference from 100-2000; perhaps
-                   500 or 1000?
+   throttle_bytes: Almost no noticeable difference from 100-2000; 100 is
+                   probably best because it includes more traffic in the
+                   computation of NIC queue length, reducing the probability
+                   of queue buildup

29. (October 2020) Polling performance impact. In isolation, polling saves
    about 4 us RTT per RPC. In the workloads, it reduces short-message P50
In the workloads, it reduces short-message P50 @@ -460,7 +1030,7 @@ Event Median so as not to interfere with user threads. This sometimes means it has to wait for a full time slice for other threads, which seems to be 5-7 ms. I tried disabling this feature of __do_softirq, so that all requests get - processed in the high-priority thread, and the P999 latency improved by + processed in the high-priority thread, and the P9999 latency improved by about 10x (< 1 ms worst case). 18. (July 2020) Small-message latency. The best-case RTT for small messages @@ -625,7 +1195,7 @@ Event Median 1. Without RPS enabled, Homa performance is limited by a single core handling all softirq actions. In order for RPS to work well, Homa must implement its own hash function for mapping packets to cores (the default IP hasher - doesn't know about Homa ports, so it considers only the peer IP address. + doesn't know about Homa ports, so it considers only the peer IP address). However, with RPS, packets can get spread out over too many cores, which causes poor latency when there is a single client and the server is underloaded. diff --git a/perf/plot_length_cdf.py b/perf/plot_length_cdf.py index 673b3121..cdcf8935 100755 --- a/perf/plot_length_cdf.py +++ b/perf/plot_length_cdf.py @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2019-2020, Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2019-2020 Homa Developers +# SPDX-License-Identifier: BSD-1-Clause # This script generates a plot showing the CDF of message lengths, # gathered from one or more experiment runs. diff --git a/protocol.md b/protocol.md index 42931927..6f9313e3 100644 --- a/protocol.md +++ b/protocol.md @@ -254,7 +254,7 @@ which violates SRPT. To prevent this problem, Homa employs a *pacer* mechanism. Homa maintains a running estimate of how many bytes have been passed to the NIC but not yet transmitted (the *NIC backlog*). If this exceeds a -threshold value (specified in units of time with the `max_nic_queue_ns` +threshold value (specified in units of time with the `max_nic_est_backlog_usecs` parameter) then no more packets will be transmitted until the NIC backlog drops below the limit. Homa maintains a *throttled list*, which contains outgoing messages that have packets ready to transmit. diff --git a/reap.txt b/reap.txt deleted file mode 100644 index 3b45415d..00000000 --- a/reap.txt +++ /dev/null @@ -1,52 +0,0 @@ -This file discusses issues related to freeing resources for completed RPCs -("reaping"). - -* Most of the cost of reaping comes from freeing skbuffs; this can be - quite expensive for RPCs with long messages. - -* The natural time to reap is when homa_rpc_free is invoked to mark an - RPC completed, but this can result in severe performance hiccups. 
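(The pacer described in protocol.md above reduces to a simple admission
check. The sketch below is hedged and self-contained: the time-based
threshold and the throttled-list behavior come from the text, while the
toy_* names and field layout are invented for illustration.)

    #include <stdbool.h>
    #include <stdint.h>

    struct toy_pacer {
            uint64_t nic_backlog_bytes;   /* estimate of bytes given to the
                                           * NIC but not yet transmitted */
            uint64_t link_bytes_per_usec; /* 12500 for a 100 Gbps link */
            uint64_t max_backlog_usecs;   /* cf. max_nic_est_backlog_usecs */
    };

    /* Returns true if a packet of the given size may be handed to the NIC
     * now; false means the message should wait on the throttled list until
     * the estimated backlog drains below the threshold. */
    bool toy_pacer_may_xmit(struct toy_pacer *p, uint64_t pkt_bytes)
    {
            uint64_t drain_usecs = p->nic_backlog_bytes /
                                   p->link_bytes_per_usec;

            if (drain_usecs >= p->max_backlog_usecs)
                    return false;
            p->nic_backlog_bytes += pkt_bytes;
            return true;
    }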
For - example, a server RPC is freed once the last packet of the response - has been transmitted, but this can happen in homa_softirq in response - to an incoming grant, and there may be other short messages waiting - to be processed. Freeing a long RPC could result in significant delay - for a subsequent short RPC. - -* Thus Homa doesn't reap immediately in homa_rpc_free. Instead, dead RPCs - are queued up and reaping occurs later, at a more convenient time where - it is less likely to impact latency. The challenge is to figure out how to - do this so that (a) we keep up with dead RPCs and (b) we minimize - the impact of reaping on latency. - -* The ideal time to reap is when threads are waiting for incoming messages - in homa_wait_for_message. The thread has nothing else to do, so reaping - can be performed with no latency impact on the application. However, - if a machine is overloaded then it may never wait, so this mechanism - isn't always sufficient. - -* Homa now reaps in two other places, if homa_wait_for_message can't - keep up: - * If dead_buffs_limit dead skbs accumulate, then homa_timer will - reap to get down to that limit. However, it seems possible that - there may be cases where a single thread cannot keep up with all - the reaping to be done. - * If homa_timer can't keep up, then as a last resort, homa_pkt_dispatch - will reap a few buffers for every incoming data packet. This is undesirable - because it will impact Homa's performance. - -* In addition, during the conversion to the new input buffering scheme for 2.0, - freeing of packets for incoming messages was moved to homa_copy_to_user, - under the assumption that this code wouldn't be on the critical path. - However, right now the packet freeing is taking 20-25% of the total - time in that function, and with faster networks it's quite possible that - this code will indeed be on the critical path. So, it probably shouldn't - be doing packet freeing after all. - -* Here are some approaches that have been tried and eventually abandoned: - * Occasionally when data packets arrive, reap if too much dead info has - accumulated. This will cause a latency impact. The amount to reap is - chosen dynamically (by homa_timer) to be as small as possible while - gradually working through the backlog. Unfortunately, the formula for - computing how much to reap was fragile and resulted in situations where - the backlog of dead RPCs grew without bound. This approach was abandoned - in October 2021. diff --git a/rsync-exclude.txt b/rsync-exclude.txt index 5a509b2d..8a8bf776 100644 --- a/rsync-exclude.txt +++ b/rsync-exclude.txt @@ -4,7 +4,9 @@ nbproject private cloudlab +patches reports +*traces* __pycache__ *.data *.pyc diff --git a/sync.txt b/sync.txt deleted file mode 100644 index e97cf5db..00000000 --- a/sync.txt +++ /dev/null @@ -1,75 +0,0 @@ -This file describes the synchronization strategy used for Homa. - -* In the Linux TCP/IP stack, the primary locking mechanism is a lock - per socket. However, per-socket locks aren't adequate for Homa, because - sockets are "larger" in Homa. In TCP, a socket corresponds to a single - connection between the source and destination; an application can have - hundreds or thousands of sockets open at once, so per-socket locks leave - lots of opportunities for concurrency. With Homa, a single socket can be - used for communicating with any number of peers, so there will typically - be no more than one socket per thread. 
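(The tiered strategy in the reap.txt notes above, which this patch deletes,
can be summarized as a per-call reaping budget. This sketch is illustrative
only: dead_buffs_limit and the three reaping sites come from the text; the
toy_* names and the specific budgets are assumptions.)

    /* Budget for how many dead skbs to reap, by caller: the wait path
     * reaps freely (the thread is idle anyway), the timer reaps down to
     * the configured limit, and the packet path reaps only a token amount,
     * as a last resort, to bound its latency impact. */
    enum reap_site { REAP_IN_WAIT, REAP_IN_TIMER, REAP_IN_PKT };

    struct toy_sock {
            int dead_skbs;          /* skbs still held by dead RPCs */
            int dead_buffs_limit;   /* timer reaps down to this level */
    };

    int toy_reap_budget(const struct toy_sock *s, enum reap_site site)
    {
            switch (site) {
            case REAP_IN_WAIT:   /* cf. homa_wait_for_message */
                    return s->dead_skbs;
            case REAP_IN_TIMER:  /* cf. homa_timer */
                    return s->dead_skbs > s->dead_buffs_limit ?
                           s->dead_skbs - s->dead_buffs_limit : 0;
            case REAP_IN_PKT:    /* cf. homa_pkt_dispatch */
                    return s->dead_skbs > 2 * s->dead_buffs_limit ? 5 : 0;
            }
            return 0;
    }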
As a result, a single Homa socket - must support many concurrent RPCs efficiently, and a per-socket lock would - create a bottleneck (Homa tried this approach initially). - -* Thus, the primary lock used in Homa is a per-RPC spinlock. This allows operations - on different RPCs to proceed concurrently. RPC locks are actually stored in - the hash table buckets used to look them up. This is important because it - makes looking up RPCs and locking them atomic. Without this approach it - is possible that an RPC could get deleted after it was looked up but before - it was locked. - -* Certain operations are not permitted while holding spinlocks, such as memory - allocation and copying data to/from user space (spinlocks disable - interrupts, so the holder must not block). RPC locks are spinlocks, - and that results in awkward code in several places to move prohibited - operations outside the locked regions. In particular, there is extra - complexity to make sure that RPCs are not garbage-collected while these - operations are occurring without a lock. - -* There are several other locks in Homa besides RPC locks. When multiple - locks are held, they must always be acquired in a consistent order, in - order to prevent deadlock. For each lock, here are the other locks that - may be acquired while holding the given lock. - * RPC: socket, grantable, throttle, peer->ack_lock - * Socket: port_map.write_lock - * Peertab: none - * peer->ack_lock: none - * Grantable: none - * Throttle: none - * Metrics: none - * port_map.write_lock: none - -* Homa's approach means that socket shutdown and deletion can potentially - occur while operations are underway that hold RPC locks but not the socket - lock. This creates several potential problems: - * A socket might be deleted and its memory reclaimed while an RPC still - has access to it. Home assumes that Linux will prevent socket deletion - while the kernel call is executing. In situations outside kernel call - handling, Homa uses rcu_read_lock to prevent socket deletion. - * A socket might be shut down while there are active operations on - RPCs. For example, a new RPC creation might be underway when a socket - is shut down, which could add the new RPC after all of its RPCs - have supposedly been deleted. Handling this requires careful ordering - of operations during shutdown, plus the rest of Homa must be careful - never to add new RPCs to a socket that has been shut down. - -* There are a few places where Homa needs to scan all of the active RPCs - for a socket, such as the timer. Such code will lock each RPC that it - finds, but there is a risk that an RPC could be deleted and its memory - recycled before it can be locked; this could result in corruption. Locking - the socket for the duration of the scan would prevent this problem, but - that isn't possible because of the locking order constraints. It's OK if - the RPC gets deleted, as long as its memory doesn't get reclaimed. The - RCU mechanism could be used for this, but RCU results in *very* long delays - before final reclamation (tens of ms), even without contention, which means - that a large number of dead RPCs could accumulate. Thus I decided not to use - the Linux RCU mechanism. Instead, Homa has a special-purpose RCU-like - mechanism via the function homa_protect_rpcs; this function prevents RPC - reaping for a socket. RPCs can still be deleted, but their memory won't go - away until homa_unprotect_rpcs is invoked. 
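(The homa_protect_rpcs/homa_unprotect_rpcs contract described above amounts
to a reap guard: deletion may proceed, reclamation may not. This minimal
sketch illustrates only the contract; the module's actual implementation
and data structures differ.)

    #include <stdatomic.h>
    #include <stdbool.h>

    struct toy_sock {
            atomic_int protect_count;   /* >0 blocks reaping of dead RPCs */
    };

    void toy_protect_rpcs(struct toy_sock *s)
    {
            atomic_fetch_add(&s->protect_count, 1);
    }

    void toy_unprotect_rpcs(struct toy_sock *s)
    {
            atomic_fetch_sub(&s->protect_count, 1);
    }

    /* Called by the reaper: may dead-RPC memory be reclaimed right now? */
    bool toy_reap_allowed(struct toy_sock *s)
    {
            return atomic_load(&s->protect_count) == 0;
    }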
- -* There are also a few places where Homa is doing something related to an - RPC (such as copying message data to user space) and needs the RPC to stay - around, but it isn't holding the RPC lock. In this situations, Homa sets - a bit in rpc->flags and homa_rpc_reap will not reap RPCs with any of these - flags set. \ No newline at end of file diff --git a/test/Makefile b/test/Makefile index 4a52f140..eaef3ef1 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,25 +1,16 @@ # Makefile to run unit tests for Homa -KDIR ?= /lib/modules/$(shell uname -r)/build +LINUX_VERSION ?= $(shell uname -r) +KDIR ?= /lib/modules/$(LINUX_VERSION)/build +LINUX_SRC_DIR ?= /ouster/linux-stable CC ?= gcc CXX ?= g++ PERL ?= perl ARCH ?= x86 -CINCLUDES := -I. \ - -I.. \ - -I$(KDIR)/arch/x86/include \ - -I$(KDIR)/arch/x86/include/generated \ - -I$(KDIR)/include \ - -I$(KDIR)/arch/x86/include/uapi \ - -I$(KDIR)/arch/x86/include/generated/uapi \ - -I$(KDIR)/include/uapi \ - -I$(KDIR)/include/generated/uapi \ - -include $(KDIR)/include/linux/kconfig.h \ - -include $(KDIR)/include/linux/compiler-version.h \ - -include $(KDIR)/include/linux/compiler_types.h -CCINCLUDES := -I. \ - -I.. \ +all: test + +KERN_INCLUDES := \ -I$(KDIR)/arch/x86/include \ -I$(KDIR)/arch/x86/include/generated \ -I$(KDIR)/include \ @@ -27,54 +18,81 @@ CCINCLUDES := -I. \ -I$(KDIR)/arch/x86/include/generated/uapi \ -I$(KDIR)/include/uapi \ -I$(KDIR)/include/generated/uapi +CINCLUDES := \ + -I. \ + -I.. \ + $(KERN_INCLUDES) \ + -include $(KDIR)/include/linux/kconfig.h +CCINCLUDES := \ + -I. \ + -I.. \ + $(KERN_KINCLUDES) DEFS := -D__KERNEL__ \ -D__UNIT_TEST__ \ -D KBUILD_MODNAME='"homa"' +ifneq ($(__STRIP__),) +DEFS += -D__STRIP__ +endif -WARNS := -Wall -Wundef -Wno-trigraphs -Wno-sign-compare \ - -Wno-strict-aliasing -Werror -CFLAGS := $(WARNS) -Wstrict-prototypes -MD -g $(CINCLUDES) $(DEFS) +WARNS := -Wall -Wundef -Wno-trigraphs -Wno-sign-compare -Wuninitialized \ + -Wno-strict-aliasing -Wunused-but-set-variable -Werror +CFLAGS := $(WARNS) -Wstrict-prototypes -MD -no-pie -g $(CINCLUDES) $(DEFS) CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address TEST_SRCS := unit_homa_incoming.c \ - unit_homa_lcache.c \ - unit_homa_offload.c \ + unit_homa_interest.c \ unit_homa_outgoing.c \ - unit_homa_peertab.c \ + unit_homa_peer.c \ unit_homa_pool.c \ unit_homa_plumbing.c \ - unit_homa_socktab.c \ + unit_homa_rpc.c \ + unit_homa_sock.c \ unit_homa_timer.c \ unit_homa_utils.c \ unit_timetrace.c +ifeq ($(__STRIP__),) +TEST_SRCS += unit_homa_grant.c \ + unit_homa_metrics.c \ + unit_homa_offload.c \ + unit_homa_pacer.c \ + unit_homa_qdisc.c \ + unit_homa_skb.c +endif TEST_OBJS := $(patsubst %.c,%.o,$(TEST_SRCS)) -HOMA_SRCS := homa_incoming.c \ - homa_offload.c \ +HOMA_SRCS := homa_devel.c \ + homa_interest.c \ + homa_incoming.c \ homa_outgoing.c \ - homa_peertab.c \ + homa_peer.c \ homa_pool.c \ homa_plumbing.c \ - homa_socktab.c \ + homa_rpc.c \ + homa_sock.c \ homa_timer.c \ homa_utils.c \ timetrace.c -HOMA_OBJS := $(patsubst %.c,%.o,$(HOMA_SRCS)) +ifeq ($(__STRIP__),) +HOMA_SRCS += homa_grant.c \ + homa_metrics.c \ + homa_offload.c \ + homa_pacer.c \ + homa_qdisc.c \ + homa_skb.c +endif +HOMA_OBJS := $(patsubst %.c,%.o,$(HOMA_SRCS)) rbtree.o rhashtable.o OTHER_SRCS := ccutils.cc \ main.c \ mock.c \ utils.c - OTHER_OBJS := $(patsubst %.c,%.o,$(patsubst %.cc,%.o,$(OTHER_SRCS))) OBJS := $(TEST_OBJS) $(HOMA_OBJS) $(OTHER_OBJS) CLEANS = unit $(OBJS) *.d .deps -all: run_tests - # This seems to be the only way to disable the built-in 
implicit rules # for %:%.c and %:%.cc. .SUFFIXES: @@ -92,12 +110,70 @@ all: run_tests %.e: %.cc $(CXX) -E $(CCFLAGS) $< -o $@ +dummyFile: + $(CXX) -c $(CCFLAGS) \ + -I $(LINUX_SRC_DIR)/include \ + -I $(LINUX_SRC_DIR)/arch/x86/include \ + -I $(LINUX_SRC_DIR)/arch/x86/include/generated \ + -include $(LINUX_SRC_DIR)/include/linux/kconfig.h \ + -O2 $< -o $@ + +# Note: Without -O2 there will be strange compiler errors such as +# 'asm operand 2 probably does not match constraints'. +rhashtable.o: rhashtable.c + gcc $(CINCLUDES) \ + -D__KERNEL__ -D__UNIT_TEST__ -O2 -g -std=gnu11 \ + -fno-strict-aliasing \ + -DKBUILD_MODFILE='"lib/rhashtable"' -DKBUILD_BASENAME='"rhashtable"' \ + -DKBUILD_MODNAME='"rhashtable"' -D__KBUILD_MODNAME=kmod_rhashtable \ + -c $< -o $@ + unit: $(OBJS) $(CXX) $(CFLAGS) $^ -o $@ -lasan -run_tests: unit +test: unit ./unit +# Additional definitions for running unit tests using stripped sources. + +S_HOMA_SRCS := $(patsubst %,stripped/%,$(filter-out timetrace.c, $(HOMA_SRCS))) +S_HOMA_OBJS := $(patsubst %.c,%.o,$(S_HOMA_SRCS)) +S_HOMA_HDRS := stripped/homa.h \ + stripped/homa_impl.h \ + stripped/homa_peer.h \ + stripped/homa_pool.h \ + stripped/homa_receiver.h \ + stripped/homa_rpc.h \ + stripped/homa_sock.h \ + stripped/homa_stub.h \ + stripped/homa_wire.h +stripped/%.c: ../%.c + ../util/strip.py --alt $< > $@ +stripped/%.h: ../%.h + ../util/strip.py --alt $< > $@ +S_TEST_OBJS := $(patsubst %,stripped/%,$(filter-out unit_timetrace.o, $(TEST_OBJS))) +S_OBJS := $(S_HOMA_OBJS) $(S_TEST_OBJS) $(patsubst %,stripped/%,$(OTHER_OBJS)) + +$(S_OBJS): | stripped $(S_HOMA_HDRS) + +stripped: + mkdir -p stripped + +stripped/%.o: stripped/%.c + $(CC) -c $(patsubst -I..,-Istripped,$(CFLAGS)) $< -o $@ +stripped/%.o: %.c + $(CC) -c $(patsubst -I..,-Istripped,$(CFLAGS)) $< -o $@ +stripped/%.o: %.cc + $(CXX) -c $(patsubst -I..,-Istripped,$(CCFLAGS)) $< -o $@ + +s_unit: $(S_OBJS) + $(CXX) $(CFLAGS) $^ -o $@ -lasan + +s_test: s_unit + ./s_unit + +CLEANS += s_unit + # The target below shouldn't be needed: theoretically, any code that is # sensitive to IPv4 vs. IPv6 should be tested explicitly, regardless of # the --ipv4 argument. @@ -108,13 +184,14 @@ test_both: unit ./unit clean: - rm -f unit $(CLEANS) + rm -f $(CLEANS) + rm -rf stripped # This magic (along with the -MD gcc option) automatically generates makefile # dependencies for header files included from C source files we compile, # and keeps those dependencies up-to-date every time we recompile. # See 'mergedep.pl' for more information. -.deps: $(wildcard *.d) +.deps: $(wildcard *.d stripped/*.d) @mkdir -p $(@D) $(PERL) mergedep.pl $@ $^ -include .deps @@ -123,4 +200,3 @@ clean: # prints the value of a make variable. print-%: @echo $* = $($*) - diff --git a/test/README.md b/test/README.md new file mode 100644 index 00000000..a2826ee9 --- /dev/null +++ b/test/README.md @@ -0,0 +1,60 @@ +This directory contains unit tests for the Homa Linux kernel module. +Here are a few overall notes: + +* These are "white box" tests, not "black box" tests. Tests are written + by looking at the code and writing enough tests to make sure all of the + major code elements are covered. + +* The structure of the unit tests is isomorphic to the structure of the + code: + * There is one test file in this directory for each code file. For example, + `unit_homa_incoming.c` contains unit tests for `../homa_incoming.c`. 
+ * Within the test file, there is a block of tests for each function in the
+ corresponding code file, and the test blocks occur in the same order
+ as the functions. If you move functions around, move the tests around
+ to maintain isomorphism.
+ * The tests for each function are ordered according to which lines of code
+ in the function they test. Typically, a given test will test one or a few
+ lines of the function. The order of the tests matches the order of the
+ code ranges they test. With this approach, it's easy to scan the tests
+ for a function after you make changes to see if you need to add more
+ tests.
+ * Some functions will have an initial test labeled "basic" or "sanity check".
+ These initial tests may exercise a variety of features in the function;
+ remaining tests only need to cover things not exercised by the initial
+ test.
+
+* The name of a test indicates what function it is testing, and also gives
+ a very terse synopsis of what is being tested. For example, consider this
+ test from `homa_incoming.c`:
+ ```
+ TEST_F(homa_incoming, homa_add_packet__packet_overlaps_message_end)
+ {
+     ...
+ }
+ ```
+ The name of the test is `homa_add_packet__packet_overlaps_message_end`;
+ the test exercises the function `homa_add_packet`, and the particular
+ case is a newly arriving packet that extends past the end of the message.
+
+* In general, tests should be disaggregated so that each test only tests a small
+ amount of functionality. Avoid large tests that test many different things.
+
+* In writing tests, focus on the control structure. For example, there should
+ be tests for each branch of an `if` statement. For loops, be sure to
+ include tests that involve multiple iterations of the loop.
+
+* You don't need to individually test each side effect of a collection of
+ straight-line statements; testing one or two of them is fine.
+
+* The file `mock.c` mocks out Linux kernel functions invoked by the code
+ being tested. Where relevant, the mocking code may record information about
+ how it was invoked and/or allow for the injection of errors in results.
+
+* It should be possible to exercise virtually every line of code in Homa.
+ If it appears that you cannot exercise a particular line, check to see
+ whether `mock.c` has mechanisms you can use to get the desired effect.
+ If not, consider extending `mock.c` to provide whatever you need.
+
+* Feel free to contact John Ousterhout if you're having trouble figuring out
+ how to test a particular piece of code.
diff --git a/test/ccutils.cc b/test/ccutils.cc
index 2c0073eb..fb9877e2 100644
--- a/test/ccutils.cc
+++ b/test/ccutils.cc
@@ -1,16 +1,5 @@
-/* Copyright (c) 2019-2022, Stanford University
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-1-Clause */ /* This file various utility functions for unit testing, which are @@ -124,8 +113,14 @@ int unit_hash_size(struct unit_hash *hash) */ void unit_hook(char *id) { + static bool hook_active = false; + + if (hook_active) + return; + hook_active = true; for (hook_func& func: hooks) func(id); + hook_active = false; } /** @@ -171,6 +166,7 @@ void unit_log_clear(void) void unit_fill_data(unsigned char *data, int length, int first_value) { int i; + for (i = 0; i <= length-4; i += 4) { *reinterpret_cast(data + i) = first_value + i; } @@ -204,6 +200,7 @@ void unit_log_add_separator(char *sep) void unit_log_data(const char *separator, unsigned char *data, int length) { int i, range_start, expected_next; + if (length == 0) { unit_log_printf(separator, "empty block"); return; @@ -215,6 +212,7 @@ void unit_log_data(const char *separator, unsigned char *data, int length) expected_next = range_start; for (i = 0; i <= length-4; i += 4) { int current = *reinterpret_cast(data + i); + if (current != expected_next) { unit_log_printf(separator, "%d-%d", range_start, expected_next-1); @@ -258,6 +256,7 @@ const char *unit_log_get(void) void unit_log_printf(const char *separator, const char* format, ...) { va_list ap; + va_start(ap, format); if (!unit_log.empty() && (separator != NULL)) @@ -268,10 +267,12 @@ void unit_log_printf(const char *separator, const char* format, ...) int buf_size = 1024; while (true) { char buf[buf_size]; - // vsnprintf trashes the va_list, so copy it first va_list aq; + int length; + + // vsnprintf trashes the va_list, so copy it first __va_copy(aq, ap); - int length = vsnprintf(buf, buf_size, format, aq); + length = vsnprintf(buf, buf_size, format, aq); assert(length >= 0); // old glibc versions returned -1 if (length < buf_size) { unit_log.append(buf, length); diff --git a/test/ccutils.h b/test/ccutils.h index 158db5cd..f56dc5d7 100644 --- a/test/ccutils.h +++ b/test/ccutils.h @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2022, Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-1-Clause */ /* Utility functions for unit tests, implemented in C++. */ diff --git a/test/kselftest_harness.h b/test/kselftest_harness.h index 44f62618..d0ae3b9d 100644 --- a/test/kselftest_harness.h +++ b/test/kselftest_harness.h @@ -56,7 +56,10 @@ * and compiling them into a normal Linux executable along with the * unit tests). This creates potential problems with conflicts between * kernel header files and user-level header files. To avoid these conflicts, - * this file must be very careful about what headers it includes. + * this file must be very careful about what headers it includes. 
This file + * is based on a relatively old version of the official file; new versions + * generate even more header file conflicts, which appear very difficult + * to resolve. * This file also contains several other changes, such as: * - All tests run in a single process, rather than forking a child process * for each test. @@ -627,7 +630,7 @@ extern int strcmp(const char *s1, const char *s2); if (!(__exp _t __seen)) { \ unsigned long long __exp_print = (long long)__exp; \ unsigned long long __seen_print = (long long)__seen; \ - __TH_LOG(" Expected %s (%llu) %s %s (%llu)", \ + __TH_LOG(" Expected %s (%lld) %s %s (%lld)", \ #_expected, __exp_print, #_t, \ #_seen, __seen_print); \ __current_test->passed = 0; \ @@ -822,9 +825,7 @@ void __run_test(struct __test_metadata *t) * @verbose: Nonzero means print all test names as they run; zero means print * only for test failures. */ -static int test_harness_run(int __attribute__((unused)) argc, - char __attribute__((unused)) **argv, - int verbose) +static int test_harness_run(int argc, char **argv, int verbose) { struct __test_metadata *t; int ret = 0; diff --git a/test/main.c b/test/main.c index 61fb7019..d760827f 100644 --- a/test/main.c +++ b/test/main.c @@ -1,49 +1,32 @@ -/* Copyright (c) 2019-2022, Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* Main program for running Homa unit tests. */ +#define __NO_KSELFTEST__ 1 #include "homa_impl.h" #include "kselftest_harness.h" #include "mock.h" -static char * helpMessage = - "This program runs unit tests written in the Linux kernel kselftest " - "style.\n" +static char *helpMessage = + "This program runs unit tests written in the Linux kernel kselftest style.\n" " Usage: %s options test_name test_name ...\n" "The following options are supported:\n" " --help or -h Print this message\n" - " --ipv4 Simulate IPv4 for all packets (default: " - "use IPv6)\n" - " --verbose or -v Print the names of all tests as they run " - "(default:\n" + " --ipv4 Simulate IPv4 for all packets (default: use IPv6)\n" + " --verbose or -v Print the names of all tests as they run (default:\n" " print only tests that fail)\n" - "If one or more test_name arguments are provided, then only those " - "tests are\n" + "If one or more test_name arguments are provided, then only those tests are\n" "run; if no test names are provided, then all tests are run.\n" - "\n" - "Note: the tests should provide complete coverage of both IPv4 and " - "IPv6 without\n" - "using the --ipv4 argument (code that depends on IPv4 vs. 
IPv6 " - "already has\n" - "special test cases for each); --ipv4 is provided for occasional " - "double-checking.\n"; + "\n" + "Note: the tests should provide complete coverage of both IPv4 and IPv6 without\n" + "using the --ipv4 argument (code that depends on IPv4 vs. IPv6 already has\n" + "special test cases for each); --ipv4 is provided for occasional double-checking.\n"; -int main(int argc, char **argv) { - int i; +int main(int argc, char **argv) +{ int verbose = 0; + int i; + mock_ipv6_default = true; for (i = 1; i < argc; i++) { if ((strcmp(argv[i], "-h") == 0) || @@ -59,9 +42,8 @@ int main(int argc, char **argv) { printf("Unknown option %s; type '%s --help' for help\n", argv[i], argv[0]); return 1; - } else { + } else break; - } } test_harness_run(argc-i, argv+i, verbose); -} \ No newline at end of file +} diff --git a/test/mock.c b/test/mock.c index 9f87a134..9a0e3e15 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1,30 +1,20 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file provides simplified substitutes for many Linux variables and - * functions, in order to allow Homa unit tests to be run outside a Linux + * functions in order to allow Homa unit tests to be run outside a Linux * kernel. */ #include "homa_impl.h" +#include "homa_pool.h" +#ifndef __STRIP__ /* See strip.py */ +#include "homa_qdisc.h" +#include "homa_skb.h" +#endif /* See strip.py */ #include "ccutils.h" -#include "mock.h" #include "utils.h" -#define KSELFTEST_NOT_MAIN 1m -#include "kselftest_harness.h" +#include /* It isn't safe to include some header files, such as stdlib, because * they conflict with kernel header files. The explicit declarations @@ -33,6 +23,9 @@ extern void free(void *ptr); extern void *malloc(size_t size); +#ifdef memcpy +#undef memcpy +#endif extern void *memcpy(void *dest, const void *src, size_t n); /* The variables below can be set to non-zero values by unit tests in order @@ -40,48 +33,65 @@ extern void *memcpy(void *dest, const void *src, size_t n); * the next call to the function will fail; bit 1 corresponds to the next * call after that, and so on. 
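 * For example, a test might set
 *
 *     mock_kmalloc_errors = 6;    (binary 110)
 *
 * to make the first call to kmalloc succeed while the second and third
 * calls both fail.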
*/ -int mock_alloc_skb_errors = 0; -int mock_copy_data_errors = 0; -int mock_copy_to_iter_errors = 0; -int mock_copy_to_user_errors = 0; -int mock_cpu_idle = 0; -int mock_import_single_range_errors = 0; -int mock_import_iovec_errors = 0; -int mock_ip6_xmit_errors = 0; -int mock_ip_queue_xmit_errors = 0; -int mock_kmalloc_errors = 0; -int mock_route_errors = 0; -int mock_spin_lock_held = 0; -int mock_trylock_errors = 0; -int mock_vmalloc_errors = 0; +int mock_alloc_page_errors; +int mock_alloc_skb_errors; +int mock_cmpxchg_errors; +int mock_copy_data_errors; +int mock_copy_to_iter_errors; +int mock_copy_to_user_errors; +int mock_cpu_idle; +int mock_dst_check_errors; +int mock_ethtool_ksettings_errors; +int mock_import_ubuf_errors; +int mock_import_iovec_errors; +int mock_ip6_xmit_errors; +int mock_ip_queue_xmit_errors; +int mock_kmalloc_errors; +int mock_kthread_create_errors; +int mock_prepare_to_wait_errors; +int mock_register_protosw_errors; +int mock_register_qdisc_errors; +int mock_register_sysctl_errors; +int mock_rht_init_errors; +int mock_rht_insert_errors; +int mock_route_errors; +int mock_spin_lock_held; +int mock_trylock_errors; +int mock_vmalloc_errors; +int mock_wait_intr_irq_errors; + +/* The value that prepare_to_wait_event should return when + * mock_prepare_to_wait_errors is nonzero. + */ +int mock_prepare_to_wait_status = -ERESTARTSYS; /* The return value from calls to signal_pending(). */ -int mock_signal_pending = 0; +int mock_signal_pending; -/* Used as current task during tests. */ +/* Used as current task during tests. Also returned by kthread_run. */ struct task_struct mock_task; /* If a test sets this variable to nonzero, ip_queue_xmit will log * outgoing packets using the long format rather than short. */ -int mock_xmit_log_verbose = 0; +int mock_xmit_log_verbose; -/* If a test sets this variable to nonzero, call_rcu_sched will log - * whenever it is invoked. +/* If a test sets this variable to nonzero, calls to wake_up and + * wake_up_all will be logged. */ -int mock_log_rcu_sched = 0; +int mock_log_wakeups; -/* The maximum number of grants that can be issued in one call to - * homa_send_grants. +/* If a test sets this variable to nonzero, call_rcu_sched will log + * whenever it is invoked. */ -int mock_max_grants = 10; +int mock_log_rcu_sched; /* A zero value means that copy_to_user will actually copy bytes to * the destination address; if nonzero, then 0 bits determine which * copies actually occur (bit 0 for the first copy, etc., just like * error masks). */ -int mock_copy_to_user_dont_copy = 0; +int mock_copy_to_user_dont_copy; /* HOMA_BPAGE_SIZE will evaluate to this. */ int mock_bpage_size = 0x10000; @@ -89,47 +99,104 @@ int mock_bpage_size = 0x10000; /* HOMA_BPAGE_SHIFT will evaluate to this. */ int mock_bpage_shift = 16; -/* Keeps track of all sk_buffs that are alive in the current test. +/* Keeps track of all the spinlocks that have been locked but not unlocked. * Reset for each test. */ -static struct unit_hash *buffs_in_use = NULL; +static struct unit_hash *spinlocks_held; /* Keeps track of all the blocks of memory that have been allocated by * kmalloc but not yet freed by kfree. Reset for each test. */ -static struct unit_hash *kmallocs_in_use = NULL; +static struct unit_hash *kmallocs_in_use; /* Keeps track of all the results returned by proc_create that have not * yet been closed by calling proc_remove. Reset for each test. 
*/ -static struct unit_hash *proc_files_in_use = NULL; +static struct unit_hash *proc_files_in_use; + +/* Keeps track of all the results returned by alloc_pages that have + * not yet been released by calling put_page. The value of each entry is + * a (char *) giving the reference count for the page. Reset for each test. + */ +static struct unit_hash *pages_in_use; + +/* Number of qdiscs that have been registered but not yet unregistered + * during the current test. Reset for each test. + */ +static int registered_qdiscs; + +/* Registered by most recent call to register_qdisc. */ +static struct Qdisc_ops *qdisc_ops; /* Keeps track of all the results returned by ip_route_output_flow that - * have not yet been freed. Reset for each test. */ -static struct unit_hash *routes_in_use = NULL; + * have not yet been freed. Reset for each test. + */ +static struct unit_hash *routes_in_use; + +/* Keeps track of all sk_buffs that are alive in the current test. + * Reset for each test. + */ +static struct unit_hash *skbs_in_use; /* Keeps track of all the blocks of memory that have been allocated by * vmalloc but not yet freed by vfree. Reset for each test. */ -static struct unit_hash *vmallocs_in_use = NULL; +static struct unit_hash *vmallocs_in_use; -/* The number of locks that have been acquired but not yet released. - * Should be 0 at the end of each test. +/* The number of locks (other than spin locks) that have been acquired + * but not yet released. Should be 0 at the end of each test. */ -static int mock_active_locks = 0; +static int mock_active_locks; + +/* Total number of successful spinlock acquisitions during current test. */ +int mock_total_spin_locks; /* The number of times rcu_read_lock has been called minus the number * of times rcu_read_unlock has been called. * Should be 0 at the end of each test. */ -static int mock_active_rcu_locks = 0; +static int mock_active_rcu_locks; + +/* Number of calls to sock_hold that haven't been matched with calls + * to sock_put. + */ +int mock_sock_holds; + +/* Number of calls to homa_rpc_hold that haven't been matched with calls + * to homa_rpc_put. + */ +int mock_rpc_holds; + +/* The number of times preempt_disable() has been invoked, minus the + * number of times preempt_enable has been invoked. + */ +static int mock_preempt_disables; + +/* Used as the return value for calls to homa_clock. */ +u64 mock_clock; + +/* Add this value to mock_clock every time homa_clock is invoked. */ +u64 mock_clock_tick; + +/* If values are present here, use them as the return values from + * homa_clock, without considering mock_clock or mock_clock_tick. + */ +#define MAX_CLOCK_VALS 10 +u64 mock_clock_vals[MAX_CLOCK_VALS]; +int mock_next_clock_val = 0; +int mock_num_clock_vals = 0; -/* Used as the return value for calls to get_cycles. A value of ~0 means - * return actual clock time. +/* Used as the return value for tt_get_cycles. */ +u64 mock_tt_cycles; + +unsigned int tsc_khz = 1000000; + +/* True means that kthread_stop has been invoked for some thread, + * so kthread_should_stop should return true. */ -cycles_t mock_cycles = 0; +bool mock_exit_thread; -/* Indicates whether we should be simulation IPv6 or IPv4 in the +/* Indicates whether we should be simulating IPv6 or IPv4 in the * current test. Can be overridden by a test. */ bool mock_ipv6 = true; @@ -137,63 +204,138 @@ bool mock_ipv6 = true; /* The value to use for mock_ipv6 in each test unless overridden. */ bool mock_ipv6_default; -/* Linux's idea of the current CPU number. 
*/ -int cpu_number = 1; - /* List of priorities for all outbound packets. */ char mock_xmit_prios[1000]; -int mock_xmit_prios_offset = 0; +int mock_xmit_prios_offset; -/* Maximum packet size allowed by "network" (see homa_message_out_init; +/* Maximum packet size allowed by "network" (see homa_message_out_fill; * chosen so that data packets will have UNIT_TEST_DATA_PER_PACKET bytes * of payload. The variable can be modified if useful in some tests. * Set by mock_sock_init. */ -int mock_mtu = 0; +int mock_mtu; + +/* Used instead of MAX_SKB_FRAGS when running some unit tests. */ +int mock_max_skb_frags = MAX_SKB_FRAGS; + +/* Each bit gives the NUMA node (0 or 1) for a particular core.*/ +int mock_numa_mask = 5; + +/* Bits determine the result of successive calls to compound order, starting + * at the lowest bit. 0 means return HOMA_SKB_PAGE_ORDER, 1 means return 0. + */ +int mock_compound_order_mask; + +/* Bits specify the NUMA node number that will be returned by the next + * calls to mock_page_to_nid, starting with the low-order bit. + */ +int mock_page_nid_mask; + +/* Used to collect printk output. */ +char mock_printk_output [5000]; + +/* Used as the return values from rhashtable_walk_next calls. */ +void **mock_rht_walk_results; +int mock_rht_num_walk_results; + +/* Used instead of HOMA_MIN_DEFAULT_PORT by homa_skb.c. */ +__u16 mock_min_default_port = 0x8000; + +/* Used as sk_socket for all sockets created by mock_sock_init. + * Its sk field points to the most recently created Homa socket. */ +static struct socket mock_socket; + +/* Each of the entries in mock_hnets below is associated with the + * corresonding entry in mock_nets. + */ +#define MOCK_MAX_NETS 10 +struct net mock_nets[MOCK_MAX_NETS]; +struct homa_net *mock_hnets[MOCK_MAX_NETS]; +struct net_device mock_devices[MOCK_MAX_NETS]; -struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; -struct net_device mock_net_device = { - .gso_max_segs = 1000, - .gso_max_size = 0}; +/* Nonzero means don't generate a unit test failure when freeing peers + * if the reference count isn't zero (log a message instead). + */ +int mock_peer_free_no_fail; + +/* Link speed to return from mock_get_link_ksettings. */ +int mock_link_mbps = 10000; + +struct ethtool_ops mock_ethtool_ops = + {.get_link_ksettings = mock_get_link_ksettings}; +struct dst_ops mock_dst_ops = { + .mtu = mock_get_mtu, + .check = mock_dst_check}; +struct netdev_queue mock_net_queue = {.state = 0}; + +/* Use this as the dev queue index in new skbs. */ +int mock_queue_index = 0; + +/* Number of invocations of netif_schedule_queue. 
*/ +int mock_netif_schedule_calls; + +const struct net_offload *inet_offloads[MAX_INET_PROTOS]; +const struct net_offload *inet6_offloads[MAX_INET_PROTOS]; +struct net_offload tcp_offload; +struct net_offload tcp_v6_offload; static struct hrtimer_clock_base clock_base; -unsigned int cpu_khz = 1000000; struct task_struct *current_task = &mock_task; -unsigned long ex_handler_refcount = 0; +unsigned long ex_handler_refcount; struct net init_net; unsigned long volatile jiffies = 1100; unsigned int nr_cpu_ids = 8; -unsigned long page_offset_base = 0; -unsigned long phys_base = 0; -unsigned long vmemmap_base = 0; -int __preempt_count = 0; +unsigned long page_offset_base; +unsigned long phys_base; +unsigned long vmemmap_base; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) +kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; +#endif +int __preempt_count; +int cpu_number = 1; char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)]; -struct rps_sock_flow_table *rps_sock_flow_table - = (struct rps_sock_flow_table *) sock_flow_table; -__u32 rps_cpu_mask = 0x1f; +struct net_hotdata net_hotdata = { + .rps_cpu_mask = 0x1f, + .rps_sock_flow_table = (struct rps_sock_flow_table *) sock_flow_table +}; +int debug_locks; +struct static_call_key __SCK__cond_resched; +struct static_call_key __SCK__might_resched; +struct static_call_key __SCK__preempt_schedule; +struct paravirt_patch_template pv_ops; +struct workqueue_struct *system_wq; +struct static_key_true validate_usercopy_range; +unsigned long __per_cpu_offset[NR_CPUS]; +struct tracepoint __tracepoint_sched_set_state_tp; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +struct lockdep_map rcu_lock_map; +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ extern void add_wait_queue(struct wait_queue_head *wq_head, - struct wait_queue_entry *wq_entry) {} + struct wait_queue_entry *wq_entry) +{} struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node) { + struct sk_buff *skb; int shinfo_size; + if (mock_check_error(&mock_alloc_skb_errors)) return NULL; - struct sk_buff *skb = malloc(sizeof(struct sk_buff)); + skb = malloc(sizeof(struct sk_buff)); if (skb == NULL) - FAIL("skb malloc failed in __alloc_skb"); + FAIL(" skb malloc failed in %s", __func__); memset(skb, 0, sizeof(*skb)); - if (!buffs_in_use) - buffs_in_use = unit_hash_new(); - unit_hash_set(buffs_in_use, skb, "used"); - size = SKB_DATA_ALIGN(size); + if (!skbs_in_use) + skbs_in_use = unit_hash_new(); + unit_hash_set(skbs_in_use, skb, "used"); shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); skb->head = malloc(size + shinfo_size); memset(skb->head, 0, size + shinfo_size); if (skb->head == NULL) - FAIL("data malloc failed in __alloc_skb"); + FAIL(" data malloc failed in %s", __func__); skb->data = skb->head; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; @@ -204,33 +346,53 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, skb->users.refs.counter = 1; skb->_skb_refdst = 0; ip_hdr(skb)->saddr = 0; - skb->truesize = size; + skb->truesize = SKB_TRUESIZE(size); + skb->dev = &mock_devices[0]; return skb; } -void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) +int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, + int sync, void *key) +{ + return 0; +} + +void BUG_func(void) +{} + +void call_rcu(struct rcu_head *head, void free_func(struct rcu_head *head)) +{ + unit_log_printf("; ", "call_rcu invoked"); + free_func(head); +} + +bool cancel_work_sync(struct work_struct *work) { - if (mock_log_rcu_sched) - unit_log_printf("; ", 
"call_rcu_sched"); - func(head); + return false; } void __check_object_size(const void *ptr, unsigned long n, bool to_user) {} +void consume_skb(struct sk_buff *skb) { + kfree_skb(skb); +} + size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) { size_t bytes_left = bytes; + if (mock_check_error(&mock_copy_data_errors)) return false; if (bytes > iter->count) { - unit_log_printf("; ", "copy_from_iter needs %lu bytes, but " - "iov_iter has only %lu", bytes, iter->count); + unit_log_printf("; ", "copy_from_iter needs %lu bytes, but iov_iter has only %lu", bytes, + iter->count); return 0; } while (bytes_left > 0) { - struct iovec *iov = (struct iovec *) iter->iov; - __u64 int_base = (__u64) iov->iov_base; + struct iovec *iov = (struct iovec *) iter_iov(iter); + u64 int_base = (u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; + if (chunk_bytes > bytes_left) chunk_bytes = bytes_left; unit_log_printf("; ", "_copy_from_iter %lu bytes at %llu", @@ -240,7 +402,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) iov->iov_base = (void *) (int_base + chunk_bytes); iov->iov_len -= chunk_bytes; if (iov->iov_len == 0) - iter->iov++; + iter->__iov++; } return bytes; } @@ -284,7 +446,8 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { - __u64 int_from = (__u64) from; + u64 int_from = (u64) from; + if (mock_check_error(&mock_copy_data_errors)) return 1; if (int_from > 200000) @@ -298,20 +461,30 @@ void __copy_overflow(int size, unsigned long count) abort(); } -void do_exit(long error_code) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int debug_lockdep_rcu_enabled(void) +{ + return 0; +} +#endif + +int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *) { - while(1) {} + UNIT_HOOK("do_wait_intr_irq"); + if (mock_check_error(&mock_wait_intr_irq_errors)) + return -ERESTARTSYS; + return 0; } void dst_release(struct dst_entry *dst) { if (!dst) return; - dst->__refcnt.counter--; - if (dst->__refcnt.counter > 0) + atomic_dec(&dst->__rcuref.refcnt); + if (atomic_read(&dst->__rcuref.refcnt) > 0) return; if (!routes_in_use || unit_hash_get(routes_in_use, dst) == NULL) { - FAIL("dst_release on unknown route"); + FAIL(" %s on unknown route", __func__); return; } unit_hash_erase(routes_in_use, dst); @@ -319,17 +492,19 @@ void dst_release(struct dst_entry *dst) } void finish_wait(struct wait_queue_head *wq_head, - struct wait_queue_entry *wq_entry) {} + struct wait_queue_entry *wq_entry) +{} -#if LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0) -void get_random_bytes(void *buf, int nbytes) -#else void get_random_bytes(void *buf, size_t nbytes) -#endif { memset(buf, 0, nbytes); } +u32 get_random_u32(void) +{ + return 0; +} + int hrtimer_cancel(struct hrtimer *timer) { return 0; @@ -353,8 +528,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_base.get_time = &hrtimer_get_time; } +void hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + timer->base = &clock_base; + clock_base.get_time = &hrtimer_get_time; + timer->function = function; +} + void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 range_ns, const enum hrtimer_mode mode) {} + u64 range_ns, const enum hrtimer_mode mode) +{} void __icmp_send(struct sk_buff *skb, int type, int code, __be32 info, const struct ip_options *opt) @@ -362,7 +547,7 @@ void __icmp_send(struct sk_buff *skb, int 
type, int code, __be32 info, unit_log_printf("; ", "icmp_send type %d, code %d", type, code); } -void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, +void icmp6_send(struct sk_buff *skb, u8 type, u8 code, u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm) { @@ -374,14 +559,14 @@ int idle_cpu(int cpu) return mock_check_error(&mock_cpu_idle); } -ssize_t import_iovec(int type, const struct iovec __user * uvector, - unsigned nr_segs, unsigned fast_segs, +ssize_t import_iovec(int type, const struct iovec __user *uvector, + unsigned int nr_segs, unsigned int fast_segs, struct iovec **iov, struct iov_iter *iter) { ssize_t size; - unsigned i; + unsigned int i; - *iov = (struct iovec *) kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + *iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); if (mock_check_error(&mock_import_iovec_errors)) return -EINVAL; size = 0; @@ -393,14 +578,11 @@ ssize_t import_iovec(int type, const struct iovec __user * uvector, return size; } -int import_single_range(int type, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i) +int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { - if (mock_check_error(&mock_import_single_range_errors)) + if (mock_check_error(&mock_import_ubuf_errors)) return -EACCES; - iov->iov_base = buf; - iov->iov_len = len; - iov_iter_init(i, type, iov, 1, len); + iov_iter_ubuf(i, rw, buf, len); return 0; } @@ -436,6 +618,8 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) int inet6_register_protosw(struct inet_protosw *p) { + if (mock_check_error(&mock_register_protosw_errors)) + return -EINVAL; return 0; } @@ -488,7 +672,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, return 0; } -void inet_register_protosw(struct inet_protosw *p) {} +void inet_register_protosw(struct inet_protosw *p) +{} int inet_release(struct socket *sock) { @@ -500,10 +685,19 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return 0; } -void inet_unregister_protosw(struct inet_protosw *p) {} +void inet_unregister_protosw(struct inet_protosw *p) +{} void __init_swait_queue_head(struct swait_queue_head *q, const char *name, - struct lock_class_key *key) {} + struct lock_class_key *key) +{} + +void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) +{} + +void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, + struct lock_class_key *) +{} void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, @@ -511,7 +705,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, { direction &= READ | WRITE; i->iter_type = ITER_IOVEC | direction; - i->iov = iov; + i->__iov = iov; i->nr_segs = nr_segs; i->iov_offset = 0; i->count = count; @@ -527,25 +721,28 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len) return 0; } -void ip6_datagram_release_cb(struct sock *sk) +struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { + if (mock_check_error(&mock_dst_check_errors)) + return NULL; + return dst; } struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { + struct rtable *route; + if (mock_check_error(&mock_route_errors)) return ERR_PTR(-EHOSTUNREACH); - - struct rtable *route; route = malloc(sizeof(struct rtable)); if (!route) { - FAIL("malloc failed"); + FAIL(" malloc failed"); return ERR_PTR(-ENOMEM); } - 
route->dst.__refcnt.counter = 1; + atomic_set(&route->dst.__rcuref.refcnt, 1); route->dst.ops = &mock_dst_ops; - route->dst.dev = &mock_net_device; + route->dst.dev = &mock_devices[0]; route->dst.obsolete = 0; if (!routes_in_use) routes_in_use = unit_hash_new(); @@ -559,10 +756,11 @@ unsigned int ip6_mtu(const struct dst_entry *dst) } int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) + u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { char buffer[200]; const char *prefix = " "; + if (mock_check_error(&mock_ip6_xmit_errors)) { kfree_skb(skb); return -ENETDOWN; @@ -572,7 +770,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, mock_xmit_prios_offset += snprintf( mock_xmit_prios + mock_xmit_prios_offset, sizeof(mock_xmit_prios) - mock_xmit_prios_offset, - "%s%d", prefix, tclass >> 4); + "%s%d", prefix, tclass >> 5); if (mock_xmit_log_verbose) homa_print_packet(skb, buffer, sizeof(buffer)); else @@ -584,8 +782,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - char buffer[200]; const char *prefix = " "; + char buffer[200]; + if (mock_check_error(&mock_ip_queue_xmit_errors)) { /* Latest data (as of 1/2019) suggests that ip_queue_xmit * frees packets after errors. @@ -608,6 +807,13 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) return 0; } +struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ + if (mock_check_error(&mock_dst_check_errors)) + return NULL; + return dst; +} + unsigned int ipv4_mtu(const struct dst_entry *dst) { return mock_mtu; @@ -617,16 +823,17 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *route; + if (mock_check_error(&mock_route_errors)) return ERR_PTR(-EHOSTUNREACH); route = malloc(sizeof(struct rtable)); if (!route) { - FAIL("malloc failed"); + FAIL(" malloc failed"); return ERR_PTR(-ENOMEM); } - route->dst.__refcnt.counter = 1; + atomic_set(&route->dst.__rcuref.refcnt, 1); route->dst.ops = &mock_dst_ops; - route->dst.dev = &mock_net_device; + route->dst.dev = &mock_devices[0]; route->dst.obsolete = 0; if (!routes_in_use) routes_in_use = unit_hash_new(); @@ -640,91 +847,226 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, return 0; } -void ip4_datagram_release_cb(struct sock *sk) {} +void device_set_wakeup_capable(struct device *dev, bool capable) +{} + +void device_wakeup_disable(struct device *dev) +{} + +int device_wakeup_enable(struct device *dev) +{ + return 0; +} + +int filp_close(struct file *, fl_owner_t id) +{ + return 0; +} + +struct file *filp_open(const char *, int, umode_t) +{ + return NULL; +} + +void __fortify_panic(const u8 reason, const size_t avail, const size_t size) +{ + FAIL(" __fortify_panic invoked"); + + /* API prohibits return. 
*/ + while (1) ; +} + +ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ + return 0; +} + +ssize_t kernel_write(struct file *file, const void *buf, size_t count, + loff_t *pos) +{ + return 0; +} void kfree(const void *block) { if (block == NULL) return; + UNIT_HOOK("kfree"); if (!kmallocs_in_use || unit_hash_get(kmallocs_in_use, block) == NULL) { - FAIL("kfree on unknown block"); + FAIL(" %s on unknown block %p", __func__, block); return; } unit_hash_erase(kmallocs_in_use, block); free((void *) block); } +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +#else +void __kfree_skb(struct sk_buff *skb) +#endif { + int i; + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb->users.refs.counter--; if (skb->users.refs.counter > 0) return; skb_dst_drop(skb); - if (!buffs_in_use || unit_hash_get(buffs_in_use, skb) == NULL) { - FAIL("kfree_skb on unknown sk_buff"); + if (!skbs_in_use || unit_hash_get(skbs_in_use, skb) == NULL) { + FAIL(" kfree_skb on unknown sk_buff"); return; } - unit_hash_erase(buffs_in_use, skb); - while (skb_shinfo(skb)->frag_list) { - struct sk_buff *next = skb_shinfo(skb)->frag_list->next; - kfree_skb(skb_shinfo(skb)->frag_list); - skb_shinfo(skb)->frag_list = next; + unit_hash_erase(skbs_in_use, skb); + while (shinfo->frag_list) { + struct sk_buff *next = shinfo->frag_list->next; + + kfree_skb(shinfo->frag_list); + shinfo->frag_list = next; } + for (i = 0; i < shinfo->nr_frags; i++) + put_page(skb_frag_page(&shinfo->frags[i])); free(skb->head); free(skb); } +void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) +{ + while (segs) { + struct sk_buff *next = segs->next; + + __kfree_skb(segs); + segs = next; + } +} + +void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + return mock_kmalloc(size, gfpflags); +} + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +void __might_sleep(const char *file, int line) +{ + UNIT_HOOK("might_sleep"); +} +#endif + void *mock_kmalloc(size_t size, gfp_t flags) { + void *block; + + UNIT_HOOK("kmalloc"); if (mock_check_error(&mock_kmalloc_errors)) return NULL; - void *block = malloc(size); + if (unit_hash_size(spinlocks_held) > 0 && + (flags & ~__GFP_ZERO) != GFP_ATOMIC) + FAIL(" Incorrect flags 0x%x passed to mock_kmalloc; expected GFP_ATOMIC (0x%x)", + flags, GFP_ATOMIC); + block = malloc(size); if (!block) { - FAIL("malloc failed"); + FAIL(" malloc failed"); return NULL; } + if (flags & __GFP_ZERO) + memset(block, 0, size); if (!kmallocs_in_use) kmallocs_in_use = unit_hash_new(); unit_hash_set(kmallocs_in_use, block, "used"); return block; } +void *__kmalloc_noprof(size_t size, gfp_t flags) +{ + return mock_kmalloc(size, flags); +} + +void kvfree(const void *addr) +{ + kfree(addr); +} + +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + return mock_kmalloc(size, flags); +} + struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) 
{ - return NULL; + if (mock_check_error(&mock_kthread_create_errors)) + return ERR_PTR(-EACCES); + return &mock_task; +} + +bool kthread_should_stop(void) { + return mock_exit_thread; } int kthread_stop(struct task_struct *k) { + unit_log_printf("; ", "kthread_stop"); + mock_exit_thread = true; return 0; } #ifdef CONFIG_DEBUG_LIST -bool __list_add_valid(struct list_head *new, - struct list_head *prev, - struct list_head *next) +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) +{ + return true; +} +#endif + +bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev, + struct list_head *next) { return true; } +#ifdef CONFIG_DEBUG_LIST bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif +bool __list_del_entry_valid_or_report(struct list_head *entry) +{ + return true; +} + void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) {} +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *nest_lock, unsigned long ip) +{} + +void lockdep_rcu_suspicious(const char *file, const int line, const char *s) +{} +#endif + +int lock_is_held_type(const struct lockdep_map *lock, int read) +{ + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void lock_release(struct lockdep_map *lock, unsigned long ip) +{} +#endif + void lock_sock_nested(struct sock *sk, int subclass) { mock_active_locks++; sk->sk_lock.owned = 1; } -ssize_t __modver_version_show(struct module_attribute *a, +ssize_t __modver_version_show(const struct module_attribute *a, struct module_kobject *b, char *c) { return 0; @@ -732,37 +1074,94 @@ ssize_t __modver_version_show(struct module_attribute *a, void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) -{ - -} +{} +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_lock_nested(struct mutex *lock, unsigned int subclass) +#else void mutex_lock(struct mutex *lock) +#endif { + UNIT_HOOK("mutex_lock"); mock_active_locks++; } void mutex_unlock(struct mutex *lock) { + UNIT_HOOK("unlock"); mock_active_locks--; } +void netif_schedule_queue(struct netdev_queue *txq) +{ + mock_netif_schedule_calls++; +} + int netif_receive_skb(struct sk_buff *skb) { - struct data_header *h = (struct data_header *) + struct homa_data_hdr *h = (struct homa_data_hdr *) skb_transport_header(skb); unit_log_printf("; ", "netif_receive_skb, id %llu, offset %d", be64_to_cpu(h->common.sender_id), ntohl(h->seg.offset)); return 0; } +void __netif_schedule(struct Qdisc *q) +{} + +void preempt_count_add(int val) +{ + int i; + + for (i = 0; i < val; i++) + preempt_disable(); +} + +void preempt_count_sub(int val) +{ + int i; + + for (i = 0; i < val; i++) + preempt_enable(); +} + long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { + UNIT_HOOK("prepare_to_wait"); + if (mock_check_error(&mock_prepare_to_wait_errors)) + return mock_prepare_to_wait_status; return 0; } -int _printk(const char *fmt, ...) +int _printk(const char *format, ...) { + int len = strlen(mock_printk_output); + int available; + va_list ap; + + available = sizeof(mock_printk_output) - len; + if (available >= 10) { + if (len != 0) { + strcpy(mock_printk_output + len, "; "); + len += 2; + available -= 2; + } + va_start(ap, format); + + /* Skip initial characters of format that are used to + * indicate priority. 
+		 */
+		if (format[0] == 1)
+			format += 2;
+		vsnprintf(mock_printk_output + len, available, format, ap);
+		va_end(ap);
+
+		/* Remove trailing newline. */
+		len += strlen(mock_printk_output + len);
+		if (mock_printk_output[len - 1] == '\n')
+			mock_printk_output[len - 1] = 0;
+	}
 	return 0;
 }
 
@@ -771,8 +1170,9 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode,
 		const struct proc_ops *proc_ops)
 {
 	struct proc_dir_entry *entry = malloc(40);
+
 	if (!entry) {
-		FAIL("malloc failed");
+		FAIL(" malloc failed");
 		return ERR_PTR(-ENOMEM);
 	}
 	if (!proc_files_in_use)
@@ -781,17 +1181,24 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode,
 	return entry;
 }
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0)
 int proc_dointvec(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos)
+#else
+int proc_dointvec(const struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+#endif
 {
 	return 0;
 }
 
 void proc_remove(struct proc_dir_entry *de)
 {
+	if (!de)
+		return;
 	if (!proc_files_in_use ||
 			unit_hash_get(proc_files_in_use, de) == NULL) {
-		FAIL("proc_remove on unknown dir_entry");
+		FAIL(" %s on unknown dir_entry", __func__);
 		return;
 	}
 	unit_hash_erase(proc_files_in_use, de);
@@ -811,91 +1218,239 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
 	return NULL;
 }
 
+bool queue_work_on(int cpu, struct workqueue_struct *wq,
+		   struct work_struct *work)
+{
+	return true;
+}
+
 void _raw_spin_lock(raw_spinlock_t *lock)
 {
-	mock_active_locks++;
+	mock_record_locked(lock);
+	mock_total_spin_locks++;
 }
 
 void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
 {
 	UNIT_HOOK("spin_lock");
-	mock_active_locks++;
+	mock_record_locked(lock);
+	mock_total_spin_locks++;
+}
+
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+{
+	UNIT_HOOK("spin_lock");
+	mock_record_locked(lock);
+	mock_total_spin_locks++;
+}
+
+unsigned long _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+{
+	UNIT_HOOK("spin_lock_irqsave");
+	mock_record_locked(lock);
+	return 1234;
 }
 
+void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
+			  struct lock_class_key *key, short inner)
+{}
+
 int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
 {
+	UNIT_HOOK("spin_lock");
 	if (mock_check_error(&mock_trylock_errors))
 		return 0;
-	UNIT_HOOK("spin_lock");
-	mock_active_locks++;
+	mock_record_locked(lock);
+	mock_total_spin_locks++;
 	return 1;
 }
 
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+{
+	UNIT_HOOK("unlock");
+	mock_record_unlocked(lock);
+}
+
 void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
 {
-	mock_active_locks--;
+	UNIT_HOOK("unlock");
+	mock_record_unlocked(lock);
+}
+
+void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+{
+	mock_record_unlocked(lock);
+}
+
+void _raw_spin_unlock_irqrestore(raw_spinlock_t *lock,
+				 unsigned long flags)
+{
+	if (flags != 1234)
+		FAIL(" incorrect flags %ld returned to %s (expected 1234)",
+		     flags, __func__);
+	mock_record_unlocked(lock);
 }
 
 int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
 {
+	UNIT_HOOK("spin_lock");
	if (mock_check_error(&mock_spin_lock_held))
 		return 0;
-	mock_active_locks++;
+	mock_record_locked(lock);
 	return 1;
 }
 
-void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) {}
-
-struct ctl_table_header *register_net_sysctl(struct net *net,
-		const char *path, struct ctl_table *table)
+bool rcu_is_watching(void)
 {
-	return NULL;
+	return true;
 }
 
-void release_sock(struct sock *sk)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_read_lock_any_held(void)
 {
-	mock_active_locks--;
-	sk->sk_lock.owned = 0;
+	return 1;
 }
 
-void remove_wait_queue(struct wait_queue_head *wq_head,
-		struct wait_queue_entry *wq_entry) {}
+int rcu_read_lock_held(void)
+{
+	return 0;
+}
 
-void schedule(void)
+int rcu_read_lock_bh_held(void)
 {
-	UNIT_HOOK("schedule");
+	return 0;
 }
+#endif
 
-void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic) {}
+void __rcu_read_lock(void)
+{}
 
-void sk_common_release(struct sock *sk) {}
+void __rcu_read_unlock(void)
+{}
 
-int sk_set_peek_off(struct sock *sk, int val)
+bool rcuref_get_slowpath(rcuref_t *ref)
 {
-	return 0;
+	return true;
 }
 
-int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
-		struct iov_iter *iter, int size)
+void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) {}
+
+int register_pernet_subsys(struct pernet_operations *ops)
+{
+	return 0;
+}
+
+int register_qdisc(struct Qdisc_ops *qops)
+{
+	if (mock_check_error(&mock_register_qdisc_errors))
+		return -EINVAL;
+	registered_qdiscs++;
+	qdisc_ops = qops;
+	return 0;
+}
+
+void release_sock(struct sock *sk)
+{
+	mock_active_locks--;
+	sk->sk_lock.owned = 0;
+}
+
+void remove_wait_queue(struct wait_queue_head *wq_head,
+		       struct wait_queue_entry *wq_entry)
+{}
+
+int rtnl_is_locked(void)
+{
+	return 0;
+}
+
+void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
+{
+	if (!head || !tail)
+		return;
+
+	while (true) {
+		struct sk_buff *next = head->next;
+
+		__kfree_skb(head);
+		if (head == tail)
+			break;
+		head = next;
+	}
+}
+
+void schedule(void)
+{
+	UNIT_HOOK("schedule");
+}
+
+signed long schedule_timeout(signed long timeout)
+{
+	UNIT_HOOK("schedule_timeout");
+
+	/* Result is time remaining in timeout. */
+	return timeout - 1;
+}
+
+int __SCT__cond_resched(void)
+{
+	return 0;
+}
+
+int __SCT__might_resched(void)
+{
+	return 0;
+}
+
+void __SCT__preempt_schedule(void)
+{}
+
+void security_sk_classify_flow(const struct sock *sk,
+			       struct flowi_common *flic)
+{}
+
+void __show_free_areas(unsigned int filter, nodemask_t *nodemask,
+		       int max_zone_idx)
+{}
+
+void sk_common_release(struct sock *sk)
+{}
+
+int sk_set_peek_off(struct sock *sk, int val)
+{
+	return 0;
+}
+
+void sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+			enum skb_drop_reason reason)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0)
+	kfree_skb(skb);
+#else
+	__kfree_skb(skb);
+#endif
+}
+
+int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
+			   struct iov_iter *iter, int size)
 {
 	size_t bytes_left = size;
 
 	if (mock_check_error(&mock_copy_data_errors))
 		return -EFAULT;
 	if (bytes_left > iter->count) {
-		unit_log_printf("; ", "skb_copy_datagram_iter needs %lu bytes, "
-				"but iov_iter has only %lu",
-				bytes_left, iter->count);
+		unit_log_printf("; ", "%s needs %lu bytes, but iov_iter has only %lu",
+				__func__, bytes_left, iter->count);
 		return 0;
 	}
 	while (bytes_left > 0) {
-		struct iovec *iov = (struct iovec *) iter->iov;
-		__u64 int_base = (__u64) iov->iov_base;
+		struct iovec *iov = (struct iovec *) iter_iov(iter);
+		u64 int_base = (u64) iov->iov_base;
 		size_t chunk_bytes = iov->iov_len;
+
 		if (chunk_bytes > bytes_left)
 			chunk_bytes = bytes_left;
 		unit_log_printf("; ",
-				"skb_copy_datagram_iter: %lu bytes to 0x%llx: ",
+				"%s: %lu bytes to 0x%llx: ", __func__,
 				chunk_bytes, int_base);
 		unit_log_data(NULL, from->data + offset + size - bytes_left,
 				chunk_bytes);
@@ -904,7 +1459,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 		iov->iov_base = (void *) (int_base + chunk_bytes);
 		iov->iov_len -= chunk_bytes;
 		if (iov->iov_len == 0)
-			iter->iov++;
+			iter->__iov++;
 	}
 	return 0;
 }
 
@@ -914,39 +1469,58 @@ struct sk_buff *skb_dequeue(struct sk_buff_head *list)
 	return __skb_dequeue(list);
 }
 
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
+{}
+
 void *skb_pull(struct sk_buff *skb, unsigned int len)
 {
 	if ((skb_tail_pointer(skb) - skb->data) < len)
-		FAIL("sk_buff underflow during pull");
+		FAIL(" sk_buff underflow during %s", __func__);
 	skb->len -= len;
 	return skb->data += len;
 }
 
+void *skb_push(struct sk_buff *skb, unsigned int len)
+{
+	skb->data -= len;
+	skb->len += len;
+	if (unlikely(skb->data < skb->head))
+		FAIL(" sk_buff underflow during %s", __func__);
+	return skb->data;
+}
+
 void *skb_put(struct sk_buff *skb, unsigned int len)
 {
 	unsigned char *result = skb_tail_pointer(skb);
+
 	skb->tail += len;
 	skb->len += len;
 	return result;
 }
 
+void skb_queue_purge_reason(struct sk_buff_head *list,
+			    enum skb_drop_reason reason)
+{
+	while (skb_queue_len(list) > 0)
+		kfree_skb(__skb_dequeue(list));
+}
+
 struct sk_buff *skb_segment(struct sk_buff *head_skb,
 			    netdev_features_t features)
 {
-	struct data_header h;
-	int offset, length;
 	struct sk_buff *skb1, *skb2;
+	struct homa_data_hdr h;
+	int offset, length;
 
 	/* Split the existing packet into two packets. */
 	memcpy(&h, skb_transport_header(head_skb), sizeof(h));
 	offset = ntohl(h.seg.offset);
-	length = ntohl(h.seg.segment_length);
-	h.seg.segment_length = htonl(length/2);
-	skb1 = mock_skb_new(&ipv6_hdr(head_skb)->saddr, &h.common, length/2,
+	length = homa_data_len(head_skb);
+	skb1 = mock_skb_alloc(&ipv6_hdr(head_skb)->saddr, &h.common, length/2,
 			offset);
 	offset += length/2;
 	h.seg.offset = htonl(offset);
-	skb2 = mock_skb_new(&ipv6_hdr(head_skb)->saddr, &h.common, length/2,
+	skb2 = mock_skb_alloc(&ipv6_hdr(head_skb)->saddr, &h.common, length/2,
 			offset);
 	skb2->next = NULL;
 	skb1->next = skb2;
@@ -965,8 +1539,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname,
 	return 0;
 }
 
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
-		bool kern)
+int sock_no_accept(struct socket *sock, struct socket *newsock,
+		   struct proto_accept_arg *arg)
 {
 	return 0;
 }
@@ -998,52 +1572,78 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
 	return 0;
 }
 
-void synchronize_sched(void) {}
+void synchronize_rcu(void)
+{}
 
-void __tasklet_hi_schedule(struct tasklet_struct *t) {}
+void __tasklet_hi_schedule(struct tasklet_struct *t)
+{}
 
 void tasklet_init(struct tasklet_struct *t,
-		void (*func)(unsigned long), unsigned long data) {}
+		  void (*func)(unsigned long), unsigned long data)
+{}
+
+void tasklet_kill(struct tasklet_struct *t)
+{}
 
-void tasklet_kill(struct tasklet_struct *t) {}
+void __trace_set_current_state(int state_value)
+{}
 
-void unregister_net_sysctl_table(struct ctl_table_header *header) {}
+void unregister_net_sysctl_table(struct ctl_table_header *header)
+{
+	UNIT_LOG("; ", "unregister_net_sysctl_table");
+}
+
+void unregister_pernet_subsys(struct pernet_operations *ops)
+{}
+
+void unregister_qdisc(struct Qdisc_ops *qops)
+{
+	registered_qdiscs--;
+	qdisc_ops = NULL;
+}
 
 void vfree(const void *block)
 {
 	if (!vmallocs_in_use || unit_hash_get(vmallocs_in_use, block) == NULL) {
-		FAIL("vfree on unknown block");
+		FAIL(" %s on unknown block", __func__);
 		return;
 	}
 	unit_hash_erase(vmallocs_in_use, block);
 	free((void *) block);
 }
 
-void *vmalloc(size_t size)
+int vfs_fsync(struct file *file, int datasync)
 {
-	if (mock_check_error(&mock_vmalloc_errors))
-		return NULL;
-	void *block = malloc(size);
-	if
(!block) { - FAIL("malloc failed"); - return NULL; - } - if (!vmallocs_in_use) - vmallocs_in_use = unit_hash_new(); - unit_hash_set(vmallocs_in_use, block, "used"); - return block; + return 0; } void wait_for_completion(struct completion *x) {} -long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, +long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout) { return 0; } -void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) {} +int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, void *key) +{ + if (!mock_log_wakeups) + return 0; + if (nr_exclusive == 1) + unit_log_printf("; ", "wake_up"); + else + unit_log_printf("; ", "wake_up_all"); + return 0; +} + +void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, + int nr) +{ + if (!mock_log_wakeups) + return; + unit_log_printf("; ", "wake_up_locked"); +} int wake_up_process(struct task_struct *tsk) { @@ -1053,12 +1653,50 @@ int wake_up_process(struct task_struct *tsk) void __warn_printk(const char *s, ...) {} -int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { return 0; } +/** + * mock_alloc_pages() - Called instead of alloc_pages when Homa is compiled + * for unit testing. + */ +struct page *mock_alloc_pages(gfp_t gfp, unsigned int order) +{ + struct page *page; + + if (mock_check_error(&mock_alloc_page_errors)) + return NULL; + page = (struct page *)malloc(PAGE_SIZE << order); + if (!pages_in_use) + pages_in_use = unit_hash_new(); + unit_hash_set(pages_in_use, page, (char *)1); + return page; +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * mock_alloc_qdisc() - Allocate and initialize a new Qdisc suitable for + * use in unit tests as a homa qdisc. + * Return: The new Qdisc. The memory is dynamically allocated and must + * be kfree-d by the caller. homa_qdisc_init has not been invoked on + * this Qdisc yet. + */ +struct Qdisc *mock_alloc_qdisc(struct netdev_queue *dev_queue) +{ + struct Qdisc *qdisc; + + qdisc = kzalloc(sizeof(struct Qdisc) + sizeof(struct homa_qdisc), + GFP_ATOMIC); + qdisc->dev_queue = dev_queue; + qdisc->ops = qdisc_ops; + spin_lock_init(&qdisc->q.lock); + return qdisc; +} +#endif /* See strip.py */ + /** * mock_check_error() - Determines whether a method should simulate an error * return. @@ -1075,16 +1713,55 @@ int mock_check_error(int *errorMask) return result; } +/** + * mock_cmpxchg() - Replacement for atomic64_cmpxchg_relaxed. + */ +s64 mock_cmpxchg(atomic64_t *target, s64 old, s64 new) +{ + if (mock_check_error(&mock_cmpxchg_errors)) + return old+1; + atomic64_set(target, new); + return old; +} + /** * mock_clear_xmit_prios() - Remove all information from the list of * transmit priorities. */ -void mock_clear_xmit_prios() +void mock_clear_xmit_prios(void) { mock_xmit_prios_offset = 0; mock_xmit_prios[0] = 0; } +#ifndef __STRIP__ /* See strip.py */ +/** + * mock_compound_order() - Replacement for compound_order function. + */ +unsigned int mock_compound_order(struct page *page) +{ + unsigned int result; + + if (mock_compound_order_mask & 1) + result = 0; + else + result = HOMA_SKB_PAGE_ORDER; + mock_compound_order_mask >>= 1; + return result; +} +#endif /* See strip.py */ + +/** + * mock_cpu_to_node() - Replaces cpu_to_node to determine NUMA node for + * a CPU. 
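+ * @core:   Core number; if bit @core of mock_numa_mask is set, the core
+ *          is reported to be on NUMA node 1, otherwise node 0.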
+ */
+int mock_cpu_to_node(int core)
+{
+	if (mock_numa_mask & (1 << core))
+		return 1;
+	return 0;
+}
+
 /**
  * mock_data_ready() - Replacement for sk->sk_data_ready; logs a message
  * to indicate that it was invoked.
@@ -1096,17 +1773,56 @@ void mock_data_ready(struct sock *sk)
 }
 
 /**
- * mock_get_cycles() - Replacement for get_cycles; allows time to be
- * hard-wired using mock_cycles variable.
+ * mock_dev() - Return a net_device suitable for use in unit tests.
+ * @index:   Index of the desired device among all those available; must
+ *           be < MOCK_MAX_NETS.
+ * @homa:    struct homa that the device will be associated with; may be
+ *           needed for hnet initialization.
+ * Return:   The specified net_device. If this is the first call for @index
+ *           in this unit test, the device will be initialized. It will be
+ *           associated with mock_hnet(index), which will also be initialized
+ *           if it wasn't already initialized.
  */
-cycles_t mock_get_cycles(void)
+struct net_device *mock_dev(int index, struct homa *homa)
 {
-	if (mock_cycles == ~0) {
-		uint32_t lo, hi;
-		__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
-		return (((uint64_t)hi << 32) | lo);
+	struct net_device *dev;
+
+	if (index >= MOCK_MAX_NETS) {
+		FAIL("Index %d exceeds maximum number of network namespaces (%d)",
+		     index, MOCK_MAX_NETS);
+		index = 0;
+	}
+	dev = &mock_devices[index];
+	if (!dev->ethtool_ops) {
+		dev->gso_max_segs = 1000;
+		dev->gso_max_size = mock_mtu;
+		dev->_tx = &mock_net_queue;
+		dev->nd_net.net = &mock_nets[0];
+		dev->ethtool_ops = &mock_ethtool_ops;
+		mock_hnet(index, homa);
 	}
-	return mock_cycles;
+	return dev;
+}
+
+struct dst_entry *mock_dst_check(struct dst_entry *dst, __u32 cookie)
+{
+	if (mock_check_error(&mock_dst_check_errors))
+		return NULL;
+	return dst;
+}
+
+/**
+ * mock_get_clock() - Replacement for homa_clock; allows time to be
+ * controlled by unit tests.
+ */
+u64 mock_get_clock(void)
+{
+	if (mock_next_clock_val < mock_num_clock_vals) {
+		mock_next_clock_val++;
+		return mock_clock_vals[mock_next_clock_val - 1];
+	}
+	mock_clock += mock_clock_tick;
+	return mock_clock;
 }
 
 /**
@@ -1119,111 +1835,464 @@ unsigned int mock_get_mtu(const struct dst_entry *dst)
 	return mock_mtu;
 }
 
+void mock_get_page(struct page *page)
+{
+	int64_t ref_count = (int64_t) unit_hash_get(pages_in_use, page);
+
+	if (ref_count == 0)
+		FAIL(" unallocated page passed to %s", __func__);
+	else
+		unit_hash_set(pages_in_use, page, (void *) (ref_count+1));
+}
+
+int mock_get_link_ksettings(struct net_device *dev,
+			    struct ethtool_link_ksettings *settings)
+{
+	if (mock_check_error(&mock_ethtool_ksettings_errors))
+		return -EOPNOTSUPP;
+	memset(settings, 0, sizeof(*settings));
+	settings->base.speed = mock_link_mbps;
+	return 0;
+}
+
 /**
- * mock_rcu_read_lock() - Called instead of rcu_read_lock when Homa is compiled
- * for unit testing.
+ * mock_hnet() - Return a struct homa_net suitable for use in tests.
+ * @index:   Index of this homa_net among those available for unit tests (must
+ *           be < MOCK_MAX_NETS)
+ * @homa:    struct homa that the homa_net will be associated with.
+ * Return:   The requested homa_net. If this is the first time that @index
+ *           has been specified during this unit test, the hnet will be
+ *           initialized.
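+ *
+ * A sketch of typical test setup (variable names are illustrative,
+ * not part of this API):
+ *
+ *	hnet = mock_hnet(0, &homa);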
  */
-void mock_rcu_read_lock(void)
+struct homa_net *mock_hnet(int index, struct homa *homa)
 {
-	mock_active_rcu_locks++;
+	struct homa_net *hnet;
+
+	if (index >= MOCK_MAX_NETS) {
+		FAIL("Index %d exceeds maximum number of network namespaces (%d)",
+		     index, MOCK_MAX_NETS);
+		index = 0;
+	}
+	hnet = mock_hnets[index];
+	if (!hnet) {
+		hnet = malloc(sizeof(*hnet));
+		mock_hnets[index] = hnet;
+		homa_net_init(hnet, &mock_nets[index], homa);
+		if (index == 0)
+			mock_dev(0, homa);
+	}
+	return hnet;
 }
 
 /**
- * mock_rcu_read_unlock() - Called instead of rcu_read_unlock when Homa is
- * compiled for unit testing.
+ * mock_net_for_hnet() - Return the struct net associated with a struct
+ * homa_net, or NULL if the struct net can't be identified.
+ * @hnet:    Find the struct net associated with this.
+ * Return:   See above.
  */
-void mock_rcu_read_unlock(void)
+struct net *mock_net_for_hnet(struct homa_net *hnet)
 {
-	if (mock_active_rcu_locks == 0)
-		FAIL(" rcu_read_unlock called without rcu_read_lock");
-	mock_active_rcu_locks--;
+	int i;
+
+	for (i = 0; i < MOCK_MAX_NETS; i++) {
+		if (hnet == mock_hnets[i])
+			return &mock_nets[i];
+	}
+	return NULL;
+}
+
+void *mock_net_generic(const struct net *net, unsigned int id)
+{
+	int i;
+
+	if (id != homa_net_id)
+		return NULL;
+	for (i = 0; i < MOCK_MAX_NETS; i++) {
+		if (net == &mock_nets[i])
+			return mock_hnets[i];
+	}
+	return NULL;
 }
 
 /**
- * mock_skb_new() - Allocate and return a packet buffer. The buffer is
- * initialized as if it just arrived from the network.
+ * mock_page_refs() - Returns current reference count for page (0 if no
+ * such page exists).
+ */
+int mock_page_refs(struct page *page)
+{
+	return (int64_t) unit_hash_get(pages_in_use, page);
+}
+
+/**
+ * mock_page_to_nid() - Replacement for page_to_nid function.
+ */
+int mock_page_to_nid(struct page *page)
+{
+	int result;
+
+	if (mock_page_nid_mask & 1)
+		result = 1;
+	else
+		result = 0;
+	mock_page_nid_mask >>= 1;
+	return result;
+}
+
+void mock_preempt_disable(void)
+{
+	mock_preempt_disables++;
+}
+
+void mock_preempt_enable(void)
+{
+	if (mock_preempt_disables == 0)
+		FAIL(" preempt_enable invoked without preempt_disable");
+	mock_preempt_disables--;
+}
+
+int mock_processor_id(void)
+{
+	return cpu_number;
+}
+
+void mock_put_page(struct page *page)
+{
+	int64_t ref_count = (int64_t) unit_hash_get(pages_in_use, page);
+
+	if (ref_count == 0)
+		FAIL(" unallocated page passed to %s", __func__);
+	else {
+		ref_count--;
+		if (ref_count == 0) {
+			unit_hash_erase(pages_in_use, page);
+			free(page);
+		} else {
+			unit_hash_set(pages_in_use, page, (void *) ref_count);
+		}
+	}
+}
+
+/**
+ * mock_raw_skb() - Performs most of the work of mock_skb_alloc and
+ * mock_tcp_skb. Allocates and initializes an skb.
  * @saddr:        IPv6 address to use as the sender of the packet, in
  *                network byte order.
- * @h:            Header for the buffer; actual length and contents depend
- *                on the type.
- * @extra_bytes:  How much additional data to add to the buffer after
- *                the header.
- * @first_value:  Determines the data contents: the first __u32 will have
- *                this value, and each successive __u32 will increment by 4.
- *
- * Return:        A packet buffer containing the information described above.
- *                The caller owns this buffer and is responsible for freeing it.
+ * @protocol:     Protocol to use in the IP header, such as IPPROTO_HOMA.
+ * @length:       How many bytes of space to allocate after the IP header.
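+ *                No transport header is filled in here; callers such as
+ *                mock_skb_alloc() and mock_tcp_skb() skb_put() their own
+ *                headers into this space.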
+ * Return: The new packet buffer, initialized as if the packet just + * arrived from the network and is about to be processed at + * transport level (e.g. there will be an IP header before + * skb->tail). The skb has room for @length additional bytes, + * but they have not yet been allocated with skb_put(). The + * caller must eventually free the skb. */ -struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h, - int extra_bytes, int first_value) +struct sk_buff *mock_raw_skb(struct in6_addr *saddr, int protocol, int length) { - int header_size, ip_size, data_size, shinfo_size; - unsigned char *p; + int ip_size, data_size, shinfo_size; + struct sk_buff *skb; - switch (h->type) { - case DATA: - header_size = sizeof(struct data_header); - break; - case GRANT: - header_size = sizeof(struct grant_header); - break; - case RESEND: - header_size = sizeof(struct resend_header); - break; - case UNKNOWN: - header_size = sizeof(struct unknown_header); - break; - case BUSY: - header_size = sizeof(struct busy_header); - break; - case CUTOFFS: - header_size = sizeof(struct cutoffs_header); - break; - case FREEZE: - header_size = sizeof(struct freeze_header); - break; - case NEED_ACK: - header_size = sizeof(struct need_ack_header); - break; - case ACK: - header_size = sizeof(struct ack_header); - break; - default: - header_size = sizeof(struct common_header); - break; - } - struct sk_buff *skb = malloc(sizeof(struct sk_buff)); + /* Don't let the IP header start at the beginning of the packet + * buffer: that will confuse is_homa_pkt. + */ +#define IP_HDR_OFFSET 4 + + skb = malloc(sizeof(struct sk_buff)); memset(skb, 0, sizeof(*skb)); - if (!buffs_in_use) - buffs_in_use = unit_hash_new(); - unit_hash_set(buffs_in_use, skb, "used"); + if (!skbs_in_use) + skbs_in_use = unit_hash_new(); + unit_hash_set(skbs_in_use, skb, "used"); ip_size = mock_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr); - data_size = SKB_DATA_ALIGN(ip_size + header_size + extra_bytes); + data_size = SKB_DATA_ALIGN(IP_HDR_OFFSET + ip_size + length); shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); skb->head = malloc(data_size + shinfo_size); memset(skb->head, 0, data_size + shinfo_size); skb->data = skb->head; skb_reset_tail_pointer(skb); skb->end = skb->tail + data_size; - skb_reserve(skb, ip_size); + + /* Don't want IP header starting at the beginning of the packet + * buffer (will confuse is_homa_pkt). + */ + skb_reserve(skb, IP_HDR_OFFSET + ip_size); skb_reset_transport_header(skb); - p = skb_put(skb, header_size); - memcpy(skb->data, h, header_size); - p = skb_put(skb, extra_bytes); - unit_fill_data(p, extra_bytes, first_value); - skb->users.refs.counter = 1; + skb_reset_network_header(skb); + skb_set_network_header(skb, -ip_size); if (mock_ipv6) { ipv6_hdr(skb)->version = 6; ipv6_hdr(skb)->saddr = *saddr; - ipv6_hdr(skb)->nexthdr = IPPROTO_HOMA; + ipv6_hdr(skb)->nexthdr = protocol; + skb->protocol = htons(ETH_P_IPV6); } else { ip_hdr(skb)->version = 4; ip_hdr(skb)->saddr = saddr->in6_u.u6_addr32[3]; - ip_hdr(skb)->protocol = IPPROTO_HOMA; + ip_hdr(skb)->protocol = protocol; + ip_hdr(skb)->check = 0; + skb->protocol = htons(ETH_P_IP); } + skb->users.refs.counter = 1; skb->_skb_refdst = 0; skb->hash = 3; + skb->next = NULL; + skb->dev = &mock_devices[0]; + skb_set_queue_mapping(skb, mock_queue_index); + qdisc_skb_cb(skb)->pkt_len = length + 100; + return skb; +} + +/** + * mock_rcu_read_lock() - Called instead of rcu_read_lock when Homa is compiled + * for unit testing. 
+ */ +void mock_rcu_read_lock(void) +{ + mock_active_rcu_locks++; +} + +/** + * mock_rcu_read_unlock() - Called instead of rcu_read_unlock when Homa is + * compiled for unit testing. + */ +void mock_rcu_read_unlock(void) +{ + if (mock_active_rcu_locks == 0) + FAIL(" rcu_read_unlock called without rcu_read_lock"); + mock_active_rcu_locks--; +} + +void mock_record_locked(void *lock) +{ + if (!spinlocks_held) + spinlocks_held = unit_hash_new(); + if (unit_hash_get(spinlocks_held, lock) != NULL) + FAIL(" locking lock 0x%p when already locked", lock); + else + unit_hash_set(spinlocks_held, lock, "locked"); +} + +void mock_record_unlocked(void *lock) +{ + if (!spinlocks_held || unit_hash_get(spinlocks_held, lock) == NULL) { + FAIL(" unlocking lock 0x%p that isn't locked", lock); + return; + } + unit_hash_erase(spinlocks_held, lock); +} + +/** + * mock_register_net_sysctl() - Called instead of register_net_sysctl + * when Homa is compiled for unit testing. + */ +struct ctl_table_header *mock_register_net_sysctl(struct net *net, + const char *path, struct ctl_table *table) +{ + if (mock_check_error(&mock_register_sysctl_errors)) + return NULL; + return (struct ctl_table_header *)11111; +} + +int mock_rht_init(struct rhashtable *ht, + const struct rhashtable_params *params) +{ + if (mock_check_error(&mock_rht_init_errors)) + return -EINVAL; + return rhashtable_init(ht, params); +} + +void *mock_rht_lookup_get_insert_fast(struct rhashtable *ht, + struct rhash_head *obj, + const struct rhashtable_params params) +{ + if (mock_check_error(&mock_rht_insert_errors)) + return ERR_PTR(-EINVAL); + return rhashtable_lookup_get_insert_fast(ht, obj, params); +} + +void *mock_rht_walk_next(struct rhashtable_iter *iter) +{ + void *result; + + if (!mock_rht_walk_results) + return rhashtable_walk_next(iter); + if (mock_rht_num_walk_results == 0) + return NULL; + result = *mock_rht_walk_results; + mock_rht_walk_results++; + mock_rht_num_walk_results--; + return result; +} + +void mock_rpc_hold(struct homa_rpc *rpc) +{ + mock_rpc_holds++; + refcount_inc(&rpc->refs); +} + +void mock_rpc_put(struct homa_rpc *rpc) +{ + if (refcount_read(&rpc->refs) < 2) + FAIL("homa_rpc_put invoked when RPC has no active holds"); + mock_rpc_holds--; + refcount_dec(&rpc->refs); +} + +/** + * mock_set_clock_vals() - Specify one or more clock values to be returned + * by the next calls to homa_clock(). The list of arguments must be + * terminated by a zero value (which will not be used as a clock value). + * @t: The first clock reading to return. + */ +void mock_set_clock_vals(u64 t, ...) +{ + va_list args; + + mock_clock_vals[0] = t; + mock_num_clock_vals = 1; + va_start(args, t); + while (mock_num_clock_vals < MAX_CLOCK_VALS) { + u64 time = va_arg(args, u64); + + if (time == 0) + break; + mock_clock_vals[mock_num_clock_vals] = time; + mock_num_clock_vals++; + } + va_end(args); + mock_next_clock_val = 0; +} + +/** + * mock_set_core() - Set internal state that indicates the "current core". + * @num: Integer identifier for a core. + */ +void mock_set_core(int num) +{ + cpu_number = num; +} + +/** + * mock_set_ipv6() - Invoked by some tests to make them work when tests + * are run with --ipv4. Changes the socket to an IPv6 socket and sets + * mock_mtu and mock_ipv6. + * @hsk: Socket to reset for IPv6, if it's currently set for IPv4. 
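+ *
+ * A sketch of intended usage (@hsk is whatever socket the test created):
+ *
+ *	mock_sock_init(&hsk, hnet, 0);
+ *	mock_set_ipv6(&hsk);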
+ */
+void mock_set_ipv6(struct homa_sock *hsk)
+{
+	mock_ipv6 = true;
+	mock_mtu -= hsk->ip_header_length - sizeof(struct ipv6hdr);
+	hsk->ip_header_length = sizeof(struct ipv6hdr);
+	hsk->sock.sk_family = AF_INET6;
+}
+
+/**
+ * mock_skb_alloc() - Allocate and return a Homa packet buffer. The buffer is
+ * initialized as if it just arrived from the network.
+ * @saddr:        IPv6 address to use as the sender of the packet, in
+ *                network byte order.
+ * @h:            Header for the buffer; actual length and contents depend
+ *                on the type. If NULL then no Homa header is added;
+ *                extra_bytes of total space will be allocated for the
+ *                skb, initialized to zero.
+ * @extra_bytes:  How much additional data to add to the buffer after
+ *                the header.
+ * @first_value:  Determines the data contents: the first u32 will have
+ *                this value, and each successive u32 will increment by 4.
+ *
+ * Return:        A packet buffer containing the information described above.
+ *                The caller owns this buffer and is responsible for freeing it.
+ */
+struct sk_buff *mock_skb_alloc(struct in6_addr *saddr,
+			       struct homa_common_hdr *h, int extra_bytes,
+			       int first_value)
+{
+	struct sk_buff *skb;
+	unsigned char *p;
+	int header_size;
+
+	if (h) {
+		switch (h->type) {
+		case DATA:
+			header_size = sizeof(struct homa_data_hdr);
+			break;
+#ifndef __STRIP__ /* See strip.py */
+		case GRANT:
+			header_size = sizeof(struct homa_grant_hdr);
+			break;
+#endif /* See strip.py */
+		case RESEND:
+			header_size = sizeof(struct homa_resend_hdr);
+			break;
+		case RPC_UNKNOWN:
+			header_size = sizeof(struct homa_rpc_unknown_hdr);
+			break;
+		case BUSY:
+			header_size = sizeof(struct homa_busy_hdr);
+			break;
+#ifndef __STRIP__ /* See strip.py */
+		case CUTOFFS:
+			header_size = sizeof(struct homa_cutoffs_hdr);
+			break;
+		case FREEZE:
+			header_size = sizeof(struct homa_freeze_hdr);
+			break;
+#endif /* See strip.py */
+		case NEED_ACK:
+			header_size = sizeof(struct homa_need_ack_hdr);
+			break;
+		case ACK:
+			header_size = sizeof(struct homa_ack_hdr);
+			break;
+		default:
+			header_size = sizeof(struct homa_common_hdr);
+			break;
+		}
+	} else {
+		header_size = 0;
+	}
+	skb = mock_raw_skb(saddr, IPPROTO_HOMA, header_size + extra_bytes);
+	p = skb_transport_header(skb);
+	if (header_size != 0) {
+		p = skb_put(skb, header_size);
+		memcpy(p, h, header_size);
+	}
+	if (h && extra_bytes != 0) {
+		p = skb_put(skb, extra_bytes);
+		unit_fill_data(p, extra_bytes, first_value);
+	}
+	qdisc_skb_cb(skb)->pkt_len = extra_bytes + 100;
+	return skb;
+}
+
+/**
+ * mock_tcp_skb() - Allocate and return a TCP packet buffer. The buffer is
+ * initialized as if it just arrived from the network.
+ * @saddr:        IPv6 address to use as the sender of the packet, in
+ *                network byte order.
+ * @sequence:     Sequence number to store in the TCP header.
+ * @extra_bytes:  How much additional data to add to the buffer after
+ *                the TCP header.
+ *
+ * Return:        A packet buffer containing the information described above.
+ *                The caller owns this buffer and is responsible for freeing it.
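+ *
+ * A sketch of typical usage (the values are illustrative):
+ *
+ *	struct in6_addr src = {};
+ *	struct sk_buff *skb = mock_tcp_skb(&src, 42, 100);
+ *	... exercise the code under test ...
+ *	kfree_skb(skb);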
+ */
+struct sk_buff *mock_tcp_skb(struct in6_addr *saddr, int sequence,
+			     int extra_bytes)
+{
+	struct sk_buff *skb;
+	struct tcphdr *tcp;
+
+	skb = mock_raw_skb(saddr, IPPROTO_TCP,
+			   sizeof(struct tcphdr) + extra_bytes);
+	tcp = (struct tcphdr *)skb_put(skb, sizeof(struct tcphdr));
+	tcp->seq = htonl(sequence);
+	tcp->doff = sizeof(struct tcphdr) / 4;
+	skb_put(skb, extra_bytes);
+	qdisc_skb_cb(skb)->pkt_len = extra_bytes + 100;
 	return skb;
 }
 
@@ -1232,46 +2301,77 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h,
  */
 int mock_skb_count(void)
 {
-	return unit_hash_size(buffs_in_use);
+	return unit_hash_size(skbs_in_use);
+}
+
+void mock_sock_hold(struct sock *sk)
+{
+	mock_sock_holds++;
+}
+
+void mock_sock_put(struct sock *sk)
+{
+	if (mock_sock_holds == 0)
+		FAIL("sock_put invoked when there were no active sock_holds");
+	mock_sock_holds--;
 }
 
 /**
  * mock_sock_init() - Constructor for sockets; initializes the Homa-specific
  * part, and mocks out the non-Homa-specific parts.
- * @hsk:       Storage area to be initialized.\
- * @homa:      Overall information about the Homa protocol.
+ * @hsk:       Storage area to be initialized.
+ * @hnet:      Network namespace for the socket.
  * @port:      Port number to use for the socket, or 0 to
  *             use default.
+ * Return: 0 for success, otherwise a negative errno.
  */
-void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port)
+int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port)
 {
 	static struct ipv6_pinfo hsk_pinfo;
 	struct sock *sk = &hsk->sock;
-	int saved_port = homa->next_client_port;
+	int saved_port;
+	int err = 0;
+
+	saved_port = hnet->prev_default_port;
 	memset(hsk, 0, sizeof(*hsk));
 	sk->sk_data_ready = mock_data_ready;
 	sk->sk_family = mock_ipv6 ? AF_INET6 : AF_INET;
-	if ((port != 0) && (port >= HOMA_MIN_DEFAULT_PORT))
-		homa->next_client_port = port;
-	homa_sock_init(hsk, homa);
+	sk->sk_socket = &mock_socket;
+	memset(&mock_socket, 0, sizeof(mock_socket));
+	mock_socket.sk = sk;
+	sk->sk_net.net = mock_net_for_hnet(hnet);
+	refcount_set(&sk->sk_wmem_alloc, 1);
+	init_waitqueue_head(&mock_socket.wq.wait);
+	rcu_assign_pointer(sk->sk_wq, &mock_socket.wq);
+	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+	if (port != 0 && port >= mock_min_default_port)
+		hnet->prev_default_port = port - 1;
+	err = homa_sock_init(hsk);
+	hsk->is_server = true;
 	if (port != 0)
-		homa->next_client_port = saved_port;
-	if (port < HOMA_MIN_DEFAULT_PORT)
-		homa_sock_bind(&homa->port_map, hsk, port);
+		hnet->prev_default_port = saved_port;
+	if (err != 0)
+		return err;
+	if (port != 0 && port < mock_min_default_port)
+		homa_sock_bind(hnet, hsk, port);
 	hsk->inet.pinet6 = &hsk_pinfo;
 	mock_mtu = UNIT_TEST_DATA_PER_PACKET + hsk->ip_header_length
-			+ sizeof(struct data_header);
-	mock_net_device.gso_max_size = mock_mtu;
+			+ sizeof(struct homa_data_hdr);
+	mock_devices[0].gso_max_size = mock_mtu;
+	err = homa_pool_set_region(hsk, (void *) 0x1000000,
+				   100*HOMA_BPAGE_SIZE);
+	return err;
 }
 
 /**
  * mock_spin_unlock() - Called instead of spin_unlock when Homa is compiled
  * for unit testing.
- * @lock:    Lock to be be released (ignored).
+ * @lock:    Lock to be released.
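+ *           The lock is removed from spinlocks_held via
+ *           mock_record_unlocked(), so mock_teardown() can detect
+ *           spinlocks that are still held when a test completes.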
  */
 void mock_spin_unlock(spinlock_t *lock)
 {
-	mock_active_locks--;
+	UNIT_HOOK("unlock");
+	mock_record_unlocked(lock);
 }
 
 /**
@@ -1282,21 +2382,40 @@ void mock_spin_unlock(spinlock_t *lock)
  */
 void mock_teardown(void)
 {
+	int count, i;
+
 	cpu_number = 1;
-	cpu_khz = 1000000;
+	current_task = &mock_task;
+	mock_alloc_page_errors = 0;
 	mock_alloc_skb_errors = 0;
+	mock_cmpxchg_errors = 0;
 	mock_copy_data_errors = 0;
 	mock_copy_to_iter_errors = 0;
 	mock_copy_to_user_errors = 0;
 	mock_cpu_idle = 0;
-	mock_cycles = 0;
+	mock_clock = 0;
+	mock_clock_tick = 0;
+	mock_next_clock_val = 0;
+	mock_num_clock_vals = 0;
+	mock_tt_cycles = 0;
+	mock_ethtool_ksettings_errors = 0;
+	mock_exit_thread = false;
 	mock_ipv6 = mock_ipv6_default;
-	mock_import_single_range_errors = 0;
+	mock_dst_check_errors = 0;
+	mock_import_ubuf_errors = 0;
 	mock_import_iovec_errors = 0;
 	mock_ip6_xmit_errors = 0;
 	mock_ip_queue_xmit_errors = 0;
 	mock_kmalloc_errors = 0;
-	mock_max_grants = 10;
+	mock_kthread_create_errors = 0;
+	mock_prepare_to_wait_errors = 0;
+	mock_register_protosw_errors = 0;
+	mock_register_qdisc_errors = 0;
+	mock_register_sysctl_errors = 0;
+	mock_rht_init_errors = 0;
+	mock_rht_insert_errors = 0;
+	mock_wait_intr_irq_errors = 0;
 	mock_copy_to_user_dont_copy = 0;
 	mock_bpage_size = 0x10000;
 	mock_bpage_shift = 16;
@@ -1307,16 +2426,50 @@ void mock_teardown(void)
 	mock_trylock_errors = 0;
 	mock_vmalloc_errors = 0;
 	memset(&mock_task, 0, sizeof(mock_task));
+	mock_prepare_to_wait_status = -ERESTARTSYS;
 	mock_signal_pending = 0;
 	mock_xmit_log_verbose = 0;
+	mock_log_wakeups = 0;
 	mock_mtu = 0;
-	mock_net_device.gso_max_size = 0;
-
-	int count = unit_hash_size(buffs_in_use);
+	mock_max_skb_frags = MAX_SKB_FRAGS;
+	mock_numa_mask = 5;
+	mock_compound_order_mask = 0;
+	mock_page_nid_mask = 0;
+	mock_printk_output[0] = 0;
+	mock_rht_walk_results = NULL;
+	mock_rht_num_walk_results = 0;
+	mock_min_default_port = 0x8000;
+	homa_net_id = 0;
+	for (i = 0; i < MOCK_MAX_NETS; i++) {
+		if (mock_hnets[i]) {
+			free(mock_hnets[i]);
+			mock_hnets[i] = NULL;
+		}
+	}
+	memset(mock_devices, 0, sizeof(mock_devices));
+	mock_peer_free_no_fail = 0;
+	mock_link_mbps = 10000;
+	memset(&mock_net_queue, 0, sizeof(mock_net_queue));
+	mock_queue_index = 0;
+	mock_netif_schedule_calls = 0;
+	memset(inet_offloads, 0, sizeof(inet_offloads));
+	inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) &tcp_offload;
+	memset(inet6_offloads, 0, sizeof(inet6_offloads));
+	inet6_offloads[IPPROTO_TCP] = (struct net_offload __rcu *)
+			&tcp_v6_offload;
+	jiffies = 1100;
+
+	count = unit_hash_size(skbs_in_use);
 	if (count > 0)
 		FAIL(" %u sk_buff(s) still in use after test", count);
-	unit_hash_free(buffs_in_use);
-	buffs_in_use = NULL;
+	unit_hash_free(skbs_in_use);
+	skbs_in_use = NULL;
+
+	count = unit_hash_size(spinlocks_held);
+	if (count > 0)
+		FAIL(" %u spinlocks still held after test", count);
+	unit_hash_free(spinlocks_held);
+	spinlocks_held = NULL;
 
 	count = unit_hash_size(kmallocs_in_use);
 	if (count > 0)
@@ -1324,6 +2477,18 @@
 	unit_hash_free(kmallocs_in_use);
 	kmallocs_in_use = NULL;
 
+	count = unit_hash_size(pages_in_use);
+	if (count > 0)
+		FAIL(" %u pages still allocated after test", count);
+	unit_hash_free(pages_in_use);
+	pages_in_use = NULL;
+
+	if (registered_qdiscs != 0)
+		FAIL(" %d qdiscs still registered after test",
+		     registered_qdiscs);
+	registered_qdiscs = 0;
+	qdisc_ops = NULL;
+
 	count = unit_hash_size(proc_files_in_use);
 	if (count > 0)
 		FAIL(" %u proc file(s) still allocated after test", count);
@@
-1343,13 +2508,57 @@ void mock_teardown(void) vmallocs_in_use = NULL; if (mock_active_locks != 0) - FAIL(" %d locks still locked after test", mock_active_locks); + FAIL(" %d (non-spin) locks still locked after test", + mock_active_locks); mock_active_locks = 0; + mock_total_spin_locks = 0; if (mock_active_rcu_locks != 0) FAIL(" %d rcu_read_locks still active after test", mock_active_rcu_locks); mock_active_rcu_locks = 0; + if (mock_sock_holds != 0) + FAIL(" %d sock_holds still active after test", + mock_sock_holds); + mock_sock_holds = 0; + + if (mock_rpc_holds != 0) + FAIL(" %d homa_rpc_holds still active after test", + mock_rpc_holds); + mock_rpc_holds = 0; + + if (mock_preempt_disables != 0) + FAIL(" %d preempt_disables still active after test", + mock_preempt_disables); + mock_preempt_disables = 0; + +#ifndef __STRIP__ /* See strip.py */ + memset(homa_metrics, 0, sizeof(homa_metrics)); +#endif /* See strip.py */ + unit_hook_clear(); } + +/** + * mock_vmalloc() - Called instead of vmalloc when Homa is compiled + * for unit testing. + * @size: Number of bytes to allocate. + */ +void *mock_vmalloc(size_t size) +{ + void *block; + + UNIT_HOOK("kmalloc"); + if (mock_check_error(&mock_vmalloc_errors)) + return NULL; + block = malloc(size); + if (!block) { + FAIL(" malloc failed"); + return NULL; + } + if (!vmallocs_in_use) + vmallocs_in_use = unit_hash_new(); + unit_hash_set(vmallocs_in_use, block, "used"); + return block; +} diff --git a/test/mock.h b/test/mock.h index fccaf3de..82aaa10e 100644 --- a/test/mock.h +++ b/test/mock.h @@ -1,66 +1,257 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-1-Clause */ +#ifndef _HOMA_MOCK_H +#define _HOMA_MOCK_H -/* Functions for mocking that are exported to test code. */ +#include -extern int cpu_number; +/* Replace various Linux variables and functions with mocked ones. 
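+ * Each symbol is #undef'd where necessary and redefined to refer to a
+ * mock_* replacement, so this header must be included after the kernel
+ * headers whose definitions it overrides.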
*/ +#undef alloc_pages +#define alloc_pages mock_alloc_pages + +#define atomic64_cmpxchg_relaxed mock_cmpxchg + +#undef alloc_percpu_gfp +#define alloc_percpu_gfp(type, flags) mock_kmalloc(10 * sizeof(type), flags) + +#define compound_order mock_compound_order + +#ifdef cpu_to_node +#undef cpu_to_node +#endif +#define cpu_to_node mock_cpu_to_node + +#undef current +#define current current_task + +#undef DECLARE_PER_CPU +#define DECLARE_PER_CPU(type, name) extern type name[10] + +#undef debug_smp_processor_id +#define debug_smp_processor_id() (pcpu_hot.cpu_number) + +#undef DEFINE_PER_CPU +#define DEFINE_PER_CPU(type, name) type name[10] + +#undef free_percpu +#define free_percpu(name) kfree(name) + +#define get_page mock_get_page + +#undef HOMA_MIN_DEFAULT_PORT +#define HOMA_MIN_DEFAULT_PORT mock_min_default_port + +#define homa_rpc_hold mock_rpc_hold + +#define homa_rpc_put mock_rpc_put + +#undef kmalloc +#define kmalloc mock_kmalloc + +#undef kmalloc_array +#define kmalloc_array(count, size, type) mock_kmalloc((count) * (size), type) + +#define kthread_complete_and_exit(...) + +#undef local_irq_save +#define local_irq_save(flags) (flags) = 0 + +#define net_generic(net, id) mock_net_generic(net, id) + +#ifdef page_address +#undef page_address +#endif +#define page_address(page) ((void *)page) + +#define page_ref_count mock_page_refs + +#define page_to_nid mock_page_to_nid + +#undef per_cpu +#define per_cpu(name, core) (name[core]) + +#undef per_cpu_ptr +#define per_cpu_ptr(name, core) (&name[core]) + +#undef preempt_disable +#define preempt_disable() mock_preempt_disable() + +#undef preempt_enable +#define preempt_enable() mock_preempt_enable() + +#define put_page mock_put_page + +#define rcu_read_lock mock_rcu_read_lock + +#define rcu_read_lock_bh mock_rcu_read_lock + +#define rcu_read_unlock mock_rcu_read_unlock + +#define rcu_read_unlock_bh mock_rcu_read_unlock + +#undef register_net_sysctl +#define register_net_sysctl mock_register_net_sysctl + +#define signal_pending(...) mock_signal_pending + +#undef smp_processor_id +#define smp_processor_id() mock_processor_id() + +#define sock_hold(sock) mock_sock_hold(sock) + +#define sock_put(sock) mock_sock_put(sock) + +#define spin_unlock mock_spin_unlock + +#undef this_cpu_ptr +#define this_cpu_ptr(name) (&name[cpu_number]) + +#undef __this_cpu_read +#define __this_cpu_read(name) (name) + +#undef vmalloc +#define vmalloc mock_vmalloc + +/* Forward references: */ +struct homa; +struct homa_rpc; +struct homa_sock; +struct homa_socktab; + +/* Variables and functions for mocking that are exported to test code. 
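+ * The mock_*_errors variables are bit masks consumed by
+ * mock_check_error(): each call uses up the low-order bit, so (for
+ * example) a mask of 2 causes only the second call to fail.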
 */
+extern int mock_alloc_page_errors;
 extern int mock_alloc_skb_errors;
-extern int mock_bpage_size;
-extern int mock_bpage_shift;
+extern int mock_bpage_size;
+extern int mock_bpage_shift;
+extern u64 mock_clock;
+extern u64 mock_clock_tick;
+extern int mock_cmpxchg_errors;
+extern int mock_compound_order_mask;
 extern int mock_copy_data_errors;
 extern int mock_copy_to_user_dont_copy;
 extern int mock_copy_to_user_errors;
 extern int mock_cpu_idle;
-extern cycles_t mock_cycles;
+extern int mock_dst_check_errors;
+extern int mock_ethtool_ksettings_errors;
+extern bool mock_exit_thread;
 extern int mock_import_iovec_errors;
-extern int mock_import_single_range_errors;
+extern int mock_import_ubuf_errors;
 extern int mock_ip6_xmit_errors;
 extern int mock_ip_queue_xmit_errors;
 extern bool mock_ipv6;
 extern bool mock_ipv6_default;
 extern int mock_kmalloc_errors;
+extern int mock_kthread_create_errors;
+extern int mock_link_mbps;
+extern int mock_netif_schedule_calls;
+extern int mock_prepare_to_wait_errors;
+extern int mock_register_protosw_errors;
+extern int mock_register_qdisc_errors;
+extern int mock_register_sysctl_errors;
+extern int mock_wait_intr_irq_errors;
 extern char mock_xmit_prios[];
+extern int mock_log_wakeups;
 extern int mock_log_rcu_sched;
 extern int mock_max_grants;
+extern int mock_max_skb_frags;
+extern __u16 mock_min_default_port;
 extern int mock_mtu;
-extern struct net_device
-		mock_net_device;
+extern struct netdev_queue
+		mock_net_queue;
+extern struct net mock_nets[];
+extern int mock_numa_mask;
+extern int mock_page_nid_mask;
+extern int mock_peer_free_no_fail;
+extern int mock_prepare_to_wait_status;
+extern char mock_printk_output[];
+extern int mock_queue_index;
+extern int mock_rht_init_errors;
+extern int mock_rht_insert_errors;
+extern void **mock_rht_walk_results;
+extern int mock_rht_num_walk_results;
 extern int mock_route_errors;
+extern int mock_signal_pending;
+extern int mock_sock_holds;
 extern int mock_spin_lock_held;
 extern struct task_struct
 		mock_task;
+extern int mock_total_spin_locks;
 extern int mock_trylock_errors;
+extern u64 mock_tt_cycles;
 extern int mock_vmalloc_errors;
 extern int mock_xmit_log_verbose;
 
-extern int mock_check_error(int *errorMask);
-extern void mock_clear_xmit_prios(void);
-extern void mock_data_ready(struct sock *sk);
-extern cycles_t mock_get_cycles(void);
-extern unsigned int
-		mock_get_mtu(const struct dst_entry *dst);
-extern void mock_rcu_read_lock(void);
-extern void mock_rcu_read_unlock(void);
-extern void mock_spin_lock(spinlock_t *lock);
-extern void mock_spin_unlock(spinlock_t *lock);
-extern int mock_skb_count(void);
-extern struct sk_buff *
-		mock_skb_new(struct in6_addr *saddr, struct common_header *h,
-			int extra_bytes, int first_value);
-extern void mock_sock_destroy(struct homa_sock *hsk,
-			struct homa_socktab *socktab);
-extern void mock_sock_init(struct homa_sock *hsk, struct homa *homa,
-			int port);
-extern void mock_teardown(void);
+extern struct task_struct *current_task;
+
+struct page *
+	mock_alloc_pages(gfp_t gfp, unsigned int order);
+struct Qdisc
+	*mock_alloc_qdisc(struct netdev_queue *dev_queue);
+int	mock_check_error(int *errorMask);
+void	mock_clear_xmit_prios(void);
+s64	mock_cmpxchg(atomic64_t *target, s64 old, s64 new);
+unsigned int mock_compound_order(struct page *page);
+int	mock_cpu_to_node(int core);
+void	mock_data_ready(struct sock *sk);
+struct net_device
+	*mock_dev(int index, struct homa *homa);
+struct dst_entry
+	*mock_dst_check(struct dst_entry *, __u32 cookie);
+u64	mock_get_clock(void);
+int	mock_get_link_ksettings(struct net_device *dev,
+				struct ethtool_link_ksettings *settings);
+unsigned int
+	mock_get_mtu(const struct dst_entry *dst);
+void	mock_get_page(struct page *page);
+struct homa_net
+	*mock_hnet(int index, struct homa *homa);
+void	*mock_kmalloc(size_t size, gfp_t flags);
+struct net *mock_net_for_hnet(struct homa_net *hnet);
+void	*mock_net_generic(const struct net *net, unsigned int id);
+int	mock_page_refs(struct page *page);
+int	mock_page_to_nid(struct page *page);
+void	mock_preempt_disable(void);
+void	mock_preempt_enable(void);
+int	mock_processor_id(void);
+void	mock_put_page(struct page *page);
+struct sk_buff *
+	mock_raw_skb(struct in6_addr *saddr, int protocol, int length);
+void	mock_rcu_read_lock(void);
+void	mock_rcu_read_unlock(void);
+void	mock_record_locked(void *lock);
+void	mock_record_unlocked(void *lock);
+struct ctl_table_header *
+	mock_register_net_sysctl(struct net *net,
+				 const char *path,
+				 struct ctl_table *table);
+int	mock_rht_init(struct rhashtable *ht,
+		      const struct rhashtable_params *params);
+void	*mock_rht_lookup_get_insert_fast(struct rhashtable *ht,
+					 struct rhash_head *obj,
+					 const struct rhashtable_params params);
+void	*mock_rht_walk_next(struct rhashtable_iter *iter);
+void	mock_rpc_hold(struct homa_rpc *rpc);
+void	mock_rpc_put(struct homa_rpc *rpc);
+void	mock_set_clock_vals(u64 t, ...);
+void	mock_set_core(int num);
+void	mock_set_ipv6(struct homa_sock *hsk);
+void	mock_spin_lock(spinlock_t *lock);
+void	mock_spin_unlock(spinlock_t *lock);
+struct sk_buff *
+	mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h,
+		       int extra_bytes, int first_value);
+int	mock_skb_count(void);
+void	mock_sock_destroy(struct homa_sock *hsk,
+			  struct homa_socktab *socktab);
+void	mock_sock_hold(struct sock *sk);
+int	mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet,
+		       int port);
+void	mock_sock_put(struct sock *sk);
+struct sk_buff *
+	mock_tcp_skb(struct in6_addr *saddr, int sequence, int extra_bytes);
+void	mock_teardown(void);
+void	*mock_vmalloc(size_t size);
+
+#endif /* _HOMA_MOCK_H */
diff --git a/test/rbtree.c b/test/rbtree.c
new file mode 100644
index 00000000..9e730718
--- /dev/null
+++ b/test/rbtree.c
@@ -0,0 +1,597 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli
+  (C) 2002  David Woodhouse
+  (C) 2012  Michel Lespinasse
+
+
+  linux/lib/rbtree.c
+*/
+
+#include <linux/rbtree_augmented.h>
+#include <linux/export.h>
+
+/*
+ * red-black trees properties:  https://en.wikipedia.org/wiki/Rbtree
+ *
+ *  1) A node is either red or black
+ *  2) The root is black
+ *  3) All leaves (NULL) are black
+ *  4) Both children of every red node are black
+ *  5) Every simple path from root to leaves contains the same number
+ *     of black nodes.
+ *
+ *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ *  consecutive red nodes in a path and every red node is therefore followed by
+ *  a black. So if B is the number of black nodes on every simple path (as per
+ *  5), then the longest possible path due to 4 is 2B.
+ *
+ *  We shall indicate color with case, where black nodes are uppercase and red
+ *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ *  parentheses and have some accompanying text comment.
+ */
+
+/*
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE().
And we must not inadvertently cause (temporary) loops in the + * tree structure as seen in program order. + * + * These two requirements will allow lockless iteration of the tree -- not + * correct iteration mind you, tree rotations are not atomic so a lookup might + * miss entire subtrees. + * + * But they do guarantee that any such traversal will only see valid elements + * and that it will indeed complete -- does not get stuck in a loop. + * + * It also guarantees that if the lookup returns an element it is the 'correct' + * one. But not returning an element does _NOT_ mean it's not present. + * + * NOTE: + * + * Stores to __rb_parent_color are not important for simple lookups so those + * are left undone as of now. Nor did I check for loops involving parent + * pointers. + */ + +static inline void rb_set_black(struct rb_node *rb) +{ + rb->__rb_parent_color += RB_BLACK; +} + +static inline struct rb_node *rb_red_parent(struct rb_node *red) +{ + return (struct rb_node *)red->__rb_parent_color; +} + +/* + * Helper function for rotations: + * - old's parent and color get assigned to new + * - old gets assigned new as a parent and 'color' as a color. + */ +static inline void +__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, + struct rb_root *root, int color) +{ + struct rb_node *parent = rb_parent(old); + new->__rb_parent_color = old->__rb_parent_color; + rb_set_parent_color(old, new, color); + __rb_change_child(old, new, parent, root); +} + +static __always_inline void +__rb_insert(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + + while (true) { + /* + * Loop invariant: node is red. + */ + if (unlikely(!parent)) { + /* + * The inserted node is root. Either this is the + * first node, or we recursed at Case 1 below and + * are no longer violating 4). + */ + rb_set_parent_color(node, NULL, RB_BLACK); + break; + } + + /* + * If there is a black parent, we are done. + * Otherwise, take some corrective action as, + * per 4), we don't want a red root or two + * consecutive red nodes. + */ + if(rb_is_black(parent)) + break; + + gparent = rb_red_parent(parent); + + tmp = gparent->rb_right; + if (parent != tmp) { /* parent == gparent->rb_left */ + if (tmp && rb_is_red(tmp)) { + /* + * Case 1 - node's uncle is red (color flips). + * + * G g + * / \ / \ + * p u --> P U + * / / + * n n + * + * However, since g's parent might be red, and + * 4) does not allow this, we need to recurse + * at g. + */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; + } + + tmp = parent->rb_right; + if (node == tmp) { + /* + * Case 2 - node's uncle is black and node is + * the parent's right child (left rotate at parent). + * + * G G + * / \ / \ + * p U --> n U + * \ / + * n p + * + * This still leaves us in violation of 4), the + * continuation into Case 3 will fix that. + */ + tmp = node->rb_left; + WRITE_ONCE(parent->rb_right, tmp); + WRITE_ONCE(node->rb_left, parent); + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); + parent = node; + tmp = node->rb_right; + } + + /* + * Case 3 - node's uncle is black and node is + * the parent's left child (right rotate at gparent). 
+ * + * G P + * / \ / \ + * p U --> n g + * / \ + * n U + */ + WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */ + WRITE_ONCE(parent->rb_right, gparent); + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); + break; + } else { + tmp = gparent->rb_left; + if (tmp && rb_is_red(tmp)) { + /* Case 1 - color flips */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; + } + + tmp = parent->rb_left; + if (node == tmp) { + /* Case 2 - right rotate at parent */ + tmp = node->rb_right; + WRITE_ONCE(parent->rb_left, tmp); + WRITE_ONCE(node->rb_right, parent); + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); + parent = node; + tmp = node->rb_left; + } + + /* Case 3 - left rotate at gparent */ + WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */ + WRITE_ONCE(parent->rb_left, gparent); + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); + break; + } + } +} + +/* + * Inline version for rb_erase() use - we want to be able to inline + * and eliminate the dummy_rotate callback there + */ +static __always_inline void +____rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; + + while (true) { + /* + * Loop invariants: + * - node is black (or NULL on first iteration) + * - node is not the root (parent is not NULL) + * - All leaf paths going through parent and node have a + * black node count that is 1 lower than other leaf paths. + */ + sibling = parent->rb_right; + if (node != sibling) { /* node == parent->rb_left */ + if (rb_is_red(sibling)) { + /* + * Case 1 - left rotate at parent + * + * P S + * / \ / \ + * N s --> p Sr + * / \ / \ + * Sl Sr N Sl + */ + tmp1 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp1); + WRITE_ONCE(sibling->rb_left, parent); + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + augment_rotate(parent, sibling); + sibling = tmp1; + } + tmp1 = sibling->rb_right; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_left; + if (!tmp2 || rb_is_black(tmp2)) { + /* + * Case 2 - sibling color flip + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N s + * / \ / \ + * Sl Sr Sl Sr + * + * This leaves us violating 5) which + * can be fixed by flipping p to black + * if it was red, or by recursing at p. + * p is red when coming from Case 1. + */ + rb_set_parent_color(sibling, parent, + RB_RED); + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; + } + /* + * Case 3 - right rotate at sibling + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N sl + * / \ \ + * sl Sr S + * \ + * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). 
This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr + */ + tmp1 = tmp2->rb_right; + WRITE_ONCE(sibling->rb_left, tmp1); + WRITE_ONCE(tmp2->rb_right, sibling); + WRITE_ONCE(parent->rb_right, tmp2); + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + augment_rotate(sibling, tmp2); + tmp1 = sibling; + sibling = tmp2; + } + /* + * Case 4 - left rotate at parent + color flips + * (p and sl could be either color here. + * After rotation, p becomes black, s acquires + * p's color, and sl keeps its color) + * + * (p) (s) + * / \ / \ + * N S --> P Sr + * / \ / \ + * (sl) sr N (sl) + */ + tmp2 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp2); + WRITE_ONCE(sibling->rb_left, parent); + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); + augment_rotate(parent, sibling); + break; + } else { + sibling = parent->rb_left; + if (rb_is_red(sibling)) { + /* Case 1 - right rotate at parent */ + tmp1 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp1); + WRITE_ONCE(sibling->rb_right, parent); + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + augment_rotate(parent, sibling); + sibling = tmp1; + } + tmp1 = sibling->rb_left; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_right; + if (!tmp2 || rb_is_black(tmp2)) { + /* Case 2 - sibling color flip */ + rb_set_parent_color(sibling, parent, + RB_RED); + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; + } + /* Case 3 - left rotate at sibling */ + tmp1 = tmp2->rb_left; + WRITE_ONCE(sibling->rb_right, tmp1); + WRITE_ONCE(tmp2->rb_left, sibling); + WRITE_ONCE(parent->rb_left, tmp2); + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + augment_rotate(sibling, tmp2); + tmp1 = sibling; + sibling = tmp2; + } + /* Case 4 - right rotate at parent + color flips */ + tmp2 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp2); + WRITE_ONCE(sibling->rb_right, parent); + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); + augment_rotate(parent, sibling); + break; + } + } +} + +/* Non-inline version for rb_erase_augmented() use */ +void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + ____rb_erase_color(parent, root, augment_rotate); +} + +/* + * Non-augmented rbtree manipulation functions. + * + * We use dummy augmented callbacks here, and have the compiler optimize them + * out of the rb_insert_color() and rb_erase() function definitions. 
+ */ + +static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} +static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} +static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} + +static const struct rb_augment_callbacks dummy_callbacks = { + .propagate = dummy_propagate, + .copy = dummy_copy, + .rotate = dummy_rotate +}; + +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ + __rb_insert(node, root, dummy_rotate); +} + +void rb_erase(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, root, dummy_rotate); +} + +/* + * Augmented rbtree manipulation functions. + * + * This instantiates the same __always_inline functions as in the non-augmented + * case, but this time with user-defined callbacks. + */ + +void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + __rb_insert(node, root, augment_rotate); +} + +/* + * This function returns the first node (in sort order) of the tree. + */ +struct rb_node *rb_first(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_left) + n = n->rb_left; + return n; +} + +struct rb_node *rb_last(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_right) + n = n->rb_right; + return n; +} + +struct rb_node *rb_next(const struct rb_node *node) +{ + struct rb_node *parent; + + if (RB_EMPTY_NODE(node)) + return NULL; + + /* + * If we have a right-hand child, go down and then left as far + * as we can. + */ + if (node->rb_right) { + node = node->rb_right; + while (node->rb_left) + node = node->rb_left; + return (struct rb_node *)node; + } + + /* + * No right-hand children. Everything down and left is smaller than us, + * so any 'next' node must be in the general direction of our parent. + * Go up the tree; any time the ancestor is a right-hand child of its + * parent, keep going up. First time it's a left-hand child of its + * parent, said parent is our 'next' node. + */ + while ((parent = rb_parent(node)) && node == parent->rb_right) + node = parent; + + return parent; +} + +struct rb_node *rb_prev(const struct rb_node *node) +{ + struct rb_node *parent; + + if (RB_EMPTY_NODE(node)) + return NULL; + + /* + * If we have a left-hand child, go down and then right as far + * as we can. + */ + if (node->rb_left) { + node = node->rb_left; + while (node->rb_right) + node = node->rb_right; + return (struct rb_node *)node; + } + + /* + * No left-hand children. Go up till we find an ancestor which + * is a right-hand child of its parent. 
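+	 * The parent of that ancestor is then our 'prev' node, mirroring
+	 * the rb_next() logic above.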
+ */
+	while ((parent = rb_parent(node)) && node == parent->rb_left)
+		node = parent;
+
+	return parent;
+}
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+		     struct rb_root *root)
+{
+	struct rb_node *parent = rb_parent(victim);
+
+	/* Copy the pointers/colour from the victim to the replacement */
+	*new = *victim;
+
+	/* Set the surrounding nodes to point to the replacement */
+	if (victim->rb_left)
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+	__rb_change_child(victim, new, parent, root);
+}
+
+static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
+{
+	for (;;) {
+		if (node->rb_left)
+			node = node->rb_left;
+		else if (node->rb_right)
+			node = node->rb_right;
+		else
+			return (struct rb_node *)node;
+	}
+}
+
+struct rb_node *rb_next_postorder(const struct rb_node *node)
+{
+	const struct rb_node *parent;
+
+	if (!node)
+		return NULL;
+	parent = rb_parent(node);
+
+	/* If we're sitting on node, we've already seen our children */
+	if (parent && node == parent->rb_left && parent->rb_right) {
+		/* If we are the parent's left node, go to the parent's right
+		 * node then all the way down to the left */
+		return rb_left_deepest_node(parent->rb_right);
+	} else
+		/* Otherwise we are the parent's right node, and the parent
+		 * should be next */
+		return (struct rb_node *)parent;
+}
+
+struct rb_node *rb_first_postorder(const struct rb_root *root)
+{
+	if (!root->rb_node)
+		return NULL;
+
+	return rb_left_deepest_node(root->rb_node);
+}
diff --git a/test/rhashtable.c b/test/rhashtable.c
new file mode 100644
index 00000000..3e555d01
--- /dev/null
+++ b/test/rhashtable.c
@@ -0,0 +1,1255 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * Code partially derived from nft_hash
+ * Rewritten with rehash code from br_multicast plus single list
+ * pointer as suggested by Josh Triplett
+ */
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/rhashtable.h>
+#include <linux/err.h>
+#include <linux/export.h>
+
+#define HASH_DEFAULT_SIZE	64UL
+#define HASH_MIN_SIZE		4U
+
+union nested_table {
+	union nested_table __rcu *table;
+	struct rhash_lock_head __rcu *bucket;
+};
+
+static u32 head_hashfn(struct rhashtable *ht,
+		       const struct bucket_table *tbl,
+		       const struct rhash_head *he)
+{
+	return rht_head_hashfn(ht, tbl, he, ht->p);
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
+
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
+
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+	if (!debug_locks)
+		return 1;
+	if (unlikely(tbl->nest))
+		return 1;
+	return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
+
+static inline union nested_table *nested_table_top(
+	const struct bucket_table *tbl)
+{
+	/* The top-level bucket entry does not need RCU protection
+	 * because it's set at the same time as tbl->nest.
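+	 *
+	 * (Scale note, assuming a 64-bit build with 4 KiB pages: each
+	 * nested level is one page of pointers, so
+	 * shift = PAGE_SHIFT - ilog2(sizeof(void *)) = 12 - 3 = 9,
+	 * and each nested table holds 512 entries.)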
+ */ + return (void *)rcu_dereference_protected(tbl->buckets[0], 1); +} + +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_protected(ntbl->table, 1); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + } + + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = nested_table_top(tbl); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); +} + +static void bucket_table_free(const struct bucket_table *tbl) +{ + if (tbl->nest) + nested_bucket_table_free(tbl); + + kvfree(tbl); +} + +static void bucket_table_free_rcu(struct rcu_head *head) +{ + bucket_table_free(container_of(head, struct bucket_table, rcu)); +} + +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + bool leaf) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = alloc_hooks_tag(ht->alloc_tag, + kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO)); + + if (ntbl && leaf) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket); + } + + if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) + return ntbl; + /* Raced with another thread. */ + kfree(ntbl); + return rcu_dereference(*prev); +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = alloc_hooks_tag(ht->alloc_tag, + kmalloc_noprof(size, gfp|__GFP_ZERO)); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + false)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + +static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + struct bucket_table *tbl = NULL; + size_t size; + int i; + static struct lock_class_key __key; + + tbl = alloc_hooks_tag(ht->alloc_tag, + kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), + gfp|__GFP_ZERO, NUMA_NO_NODE)); + + size = nbuckets; + + if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; + } + + if (tbl == NULL) + return NULL; + + lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0); + + tbl->size = size; + + rcu_head_init(&tbl->rcu); + INIT_LIST_HEAD(&tbl->walkers); + + tbl->hash_rnd = get_random_u32(); + + for (i = 0; i < nbuckets; i++) + INIT_RHT_NULLS_HEAD(tbl->buckets[i]); + + return tbl; +} + +static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *new_tbl; + + do { + new_tbl = tbl; + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); + + return new_tbl; +} + +static int rhashtable_rehash_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + unsigned int old_hash) +{ + struct bucket_table *old_tbl = 
rht_dereference(ht->tbl, ht);
+	struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
+	int err = -EAGAIN;
+	struct rhash_head *head, *next, *entry;
+	struct rhash_head __rcu **pprev = NULL;
+	unsigned int new_hash;
+	unsigned long flags;
+
+	if (new_tbl->nest)
+		goto out;
+
+	err = -ENOENT;
+
+	rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
+			  old_tbl, old_hash) {
+		err = 0;
+		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
+
+		if (rht_is_a_nulls(next))
+			break;
+
+		pprev = &entry->next;
+	}
+
+	if (err)
+		goto out;
+
+	new_hash = head_hashfn(ht, new_tbl, entry);
+
+	flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash],
+				SINGLE_DEPTH_NESTING);
+
+	head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
+
+	RCU_INIT_POINTER(entry->next, head);
+
+	rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags);
+
+	if (pprev)
+		rcu_assign_pointer(*pprev, next);
+	else
+		/* Need to preserve the bit lock. */
+		rht_assign_locked(bkt, next);
+
+out:
+	return err;
+}
+
+static int rhashtable_rehash_chain(struct rhashtable *ht,
+				   unsigned int old_hash)
+{
+	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
+	struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+	unsigned long flags;
+	int err;
+
+	if (!bkt)
+		return 0;
+	flags = rht_lock(old_tbl, bkt);
+
+	while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
+		;
+
+	if (err == -ENOENT)
+		err = 0;
+	rht_unlock(old_tbl, bkt, flags);
+
+	return err;
+}
+
+static int rhashtable_rehash_attach(struct rhashtable *ht,
+				    struct bucket_table *old_tbl,
+				    struct bucket_table *new_tbl)
+{
+	/* Make insertions go into the new, empty table right away. Deletions
+	 * and lookups will be attempted in both tables until we synchronize.
+	 * As cmpxchg() provides strong barriers, we do not need
+	 * rcu_assign_pointer().
+	 */
+
+	if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
+		    new_tbl) != NULL)
+		return -EEXIST;
+
+	return 0;
+}
+
+static int rhashtable_rehash_table(struct rhashtable *ht)
+{
+	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *new_tbl;
+	struct rhashtable_walker *walker;
+	unsigned int old_hash;
+	int err;
+
+	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
+	if (!new_tbl)
+		return 0;
+
+	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+		err = rhashtable_rehash_chain(ht, old_hash);
+		if (err)
+			return err;
+		cond_resched();
+	}
+
+	/* Publish the new table pointer. */
+	rcu_assign_pointer(ht->tbl, new_tbl);
+
+	spin_lock(&ht->lock);
+	list_for_each_entry(walker, &old_tbl->walkers, list)
+		walker->tbl = NULL;
+
+	/* Wait for readers. All new readers will see the new
+	 * table, and thus no references to the old table will
+	 * remain.
+	 * We do this inside the locked region so that
+	 * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+	 * to check if it should not re-link the table.
+	 */
+	call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+	spin_unlock(&ht->lock);
+
+	return rht_dereference(new_tbl->future_tbl, ht) ?
-EAGAIN : 0; +} + +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) +{ + struct bucket_table *new_tbl; + int err; + + ASSERT_RHT_MUTEX(ht); + + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; + + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; +} + +/** + * rhashtable_shrink - Shrink hash table while allowing concurrent lookups + * @ht: the hash table to shrink + * + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * The caller must ensure that no concurrent table mutations take place. + * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ +static int rhashtable_shrink(struct rhashtable *ht) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + unsigned int nelems = atomic_read(&ht->nelems); + unsigned int size = 0; + + if (nelems) + size = roundup_pow_of_two(nelems * 3 / 2); + if (size < ht->p.min_size) + size = ht->p.min_size; + + if (old_tbl->size <= size) + return 0; + + if (rht_dereference(old_tbl->future_tbl, ht)) + return -EEXIST; + + return rhashtable_rehash_alloc(ht, old_tbl, size); +} + +static void rht_deferred_worker(struct work_struct *work) +{ + struct rhashtable *ht; + struct bucket_table *tbl; + int err = 0; + + ht = container_of(work, struct rhashtable, run_work); + mutex_lock(&ht->mutex); + + tbl = rht_dereference(ht->tbl, ht); + tbl = rhashtable_last_table(ht, tbl); + + if (rht_grow_above_75(ht, tbl)) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); + else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); + + if (!err || err == -EEXIST) { + int nerr; + + nerr = rhashtable_rehash_table(ht); + err = err ?: nerr; + } + + mutex_unlock(&ht->mutex); + + if (err) + schedule_work(&ht->run_work); +} + +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *old_tbl; + struct bucket_table *new_tbl; + unsigned int size; + int err; + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + + size = tbl->size; + + err = -EBUSY; + + if (rht_grow_above_75(ht, tbl)) + size *= 2; + /* Do not schedule more than one rehash */ + else if (old_tbl != tbl) + goto fail; + + err = -ENOMEM; + + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); + if (new_tbl == NULL) + goto fail; + + err = rhashtable_rehash_attach(ht, tbl, new_tbl); + if (err) { + bucket_table_free(new_tbl); + if (err == -EEXIST) + err = 0; + } else + schedule_work(&ht->run_work); + + return err; + +fail: + /* Do not fail the insert if someone else did a rehash. */ + if (likely(rcu_access_pointer(tbl->future_tbl))) + return 0; + + /* Schedule async rehash to retry allocation in process context. 
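+	 * The GFP_ATOMIC | __GFP_NOWARN attempt above may legitimately
+	 * fail under memory pressure; rht_deferred_worker() retries the
+	 * table allocation from process context with GFP_KERNEL.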
*/ + if (err == -ENOMEM) + schedule_work(&ht->run_work); + + return err; +} + +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev = NULL; + struct rhash_head *head; + int elasticity; + + elasticity = RHT_ELASTICITY; + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } + + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) + rcu_assign_pointer(*pprev, obj); + else + /* Need to preserve the bit lock */ + rht_assign_locked(bkt, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one( + struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); + + head = rht_ptr(bkt, tbl, hash); + + RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + /* bkt is always the head of the list, so it holds + * the lock, which we need to preserve + */ + rht_assign_locked(bkt, obj); + + return NULL; +} + +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + struct rhash_lock_head __rcu **bkt; + unsigned long flags; + unsigned int hash; + void *data; + + new_tbl = rcu_dereference(ht->tbl); + + do { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + if (rcu_access_pointer(tbl->future_tbl)) + /* Failure is OK */ + bkt = rht_bucket_var(tbl, hash); + else + bkt = rht_bucket_insert(ht, tbl, hash); + if (bkt == NULL) { + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + data = ERR_PTR(-EAGAIN); + } else { + bool inserted; + + flags = rht_lock(tbl, bkt); + data = rhashtable_lookup_one(ht, bkt, tbl, + hash, key, obj); + new_tbl = rhashtable_insert_one(ht, bkt, tbl, + hash, obj, data); + inserted = data && !new_tbl; + if (inserted) + atomic_inc(&ht->nelems); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + rht_unlock(tbl, bkt, flags); + + if (inserted && rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + } + } while (!IS_ERR_OR_NULL(new_tbl)); + + if (PTR_ERR(data) == -EAGAIN) + data = 
ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); + +/** + * rhashtable_walk_enter - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptible context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->end_of_table = 0; + + spin_lock(&ht->lock); + iter->walker.tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); + spin_unlock(&ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by rhashtable_walk_enter. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + spin_lock(&iter->ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&iter->ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start_check - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + * + * rhashtable_walk_start is defined as an inline variant that returns + * void. This is preferred in cases where the caller would ignore + * resize events and always continue. 
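+ *
+ * For orientation, a minimal walker might look like this (iter and obj
+ * are hypothetical locals; the -EAGAIN handling follows the rules
+ * above):
+ *
+ *	rhashtable_walk_enter(ht, &iter);
+ *	rhashtable_walk_start(&iter);
+ *	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
+ *		if (IS_ERR(obj)) {
+ *			if (PTR_ERR(obj) == -EAGAIN)
+ *				continue;	(iterator was rewound)
+ *			break;
+ *		}
+ *		(... examine obj ...)
+ *	}
+ *	rhashtable_walk_stop(&iter);
+ *	rhashtable_walk_exit(&iter);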
+ */ +int rhashtable_walk_start_check(struct rhashtable_iter *iter) + __acquires(RCU) +{ + struct rhashtable *ht = iter->ht; + bool rhlist = ht->rhlist; + + rcu_read_lock(); + + spin_lock(&ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&ht->lock); + + if (iter->end_of_table) + return 0; + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); + iter->slot = 0; + iter->skip = 0; + return -EAGAIN; + } + + if (iter->p && !rhlist) { + /* + * We need to validate that 'p' is still in the table, and + * if so, update 'skip' + */ + struct rhash_head *p; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + skip++; + if (p == iter->p) { + iter->skip = skip; + goto found; + } + } + iter->p = NULL; + } else if (iter->p && rhlist) { + /* Need to validate that 'list' is still in the table, and + * if so, update 'skip' and 'p'. + */ + struct rhash_head *p; + struct rhlist_head *list; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + for (list = container_of(p, struct rhlist_head, rhead); + list; + list = rcu_dereference(list->next)) { + skip++; + if (list == iter->list) { + iter->p = p; + iter->skip = skip; + goto found; + } + } + } + iter->p = NULL; + } +found: + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); + +/** + * __rhashtable_walk_find_next - Find the next element in a table (or the first + * one in case of a new walk). + * + * @iter: Hash table iterator + * + * Returns the found object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. + */ +static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) +{ + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (!tbl) + return NULL; + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + iter->skip = 0; + } + + iter->p = NULL; + + /* Ensure we see any new tables. */ + smp_rmb(); + + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { + iter->slot = 0; + iter->skip = 0; + return ERR_PTR(-EAGAIN); + } else { + iter->end_of_table = true; + } + + return NULL; +} + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. 
+ */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (p) { + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + /* At the end of this slot, switch to next one and then find + * next entry from that point. + */ + iter->skip = 0; + iter->slot++; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_peek - Return the next object but don't advance the iterator + * @iter: Hash table iterator + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *rhashtable_walk_peek(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + + if (p) + return rht_obj(ht, ht->rhlist ? &list->rhead : p); + + /* No object found in current iter, find next one in the table. */ + + if (iter->skip) { + /* A nonzero skip value points to the next entry in the table + * beyond that last one that was found. Decrement skip so + * we find the current value. __rhashtable_walk_find_next + * will restore the original value of skip assuming that + * the table hasn't changed. + */ + iter->skip--; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_peek); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. + */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) +{ + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker.tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) + /* This bucket table is being freed, don't re-link it. */ + iter->walker.tbl = NULL; + else + list_add(&iter->walker.list, &tbl->walkers); + spin_unlock(&ht->lock); + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); + +static size_t rounded_hashtable_size(const struct rhashtable_params *params) +{ + size_t retsize; + + if (params->nelem_hint) + retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); + else + retsize = max(HASH_DEFAULT_SIZE, + (unsigned long)params->min_size); + + return retsize; +} + +static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ + return jhash2(key, length, seed); +} + +/** + * rhashtable_init - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * + * Initializes a new hash table based on the provided configuration + * parameters. 
A table can be configured either with a variable or + * fixed length key: + * + * Configuration Example 1: Fixed length keys + * struct test_obj { + * int key; + * void * my_member; + * struct rhash_head node; + * }; + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .key_offset = offsetof(struct test_obj, key), + * .key_len = sizeof(int), + * .hashfn = jhash, + * }; + * + * Configuration Example 2: Variable length keys + * struct test_obj { + * [...] + * struct rhash_head node; + * }; + * + * u32 my_hash_fn(const void *data, u32 len, u32 seed) + * { + * struct test_obj *obj = data; + * + * return [... hash ...]; + * } + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .hashfn = jhash, + * .obj_hashfn = my_hash_fn, + * }; + */ +int rhashtable_init_noprof(struct rhashtable *ht, + const struct rhashtable_params *params) +{ + struct bucket_table *tbl; + size_t size; + + if ((!params->key_len && !params->obj_hashfn) || + (params->obj_hashfn && !params->obj_cmpfn)) + return -EINVAL; + + memset(ht, 0, sizeof(*ht)); + mutex_init(&ht->mutex); + spin_lock_init(&ht->lock); + memcpy(&ht->p, params, sizeof(*params)); + + alloc_tag_record(ht->alloc_tag); + + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; + + if (params->max_size) { + ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } + + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + + size = rounded_hashtable_size(&ht->p); + + ht->key_len = ht->p.key_len; + if (!params->hashfn) { + ht->p.hashfn = jhash; + + if (!(ht->key_len & (sizeof(u32) - 1))) { + ht->key_len /= sizeof(u32); + ht->p.hashfn = rhashtable_jhash2; + } + } + + /* + * This is api initialization and thus we need to guarantee the + * initial rhashtable allocation. Upon failure, retry with the + * smallest possible size with __GFP_NOFAIL semantics. + */ + tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (unlikely(tbl == NULL)) { + size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); + } + + atomic_set(&ht->nelems, 0); + + RCU_INIT_POINTER(ht->tbl, tbl); + + INIT_WORK(&ht->run_work, rht_deferred_worker); + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_init_noprof); + +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. 
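+ *
+ * Unlike a plain rhashtable, an rhltable tolerates duplicate keys:
+ * objects embed a struct rhlist_head rather than a struct rhash_head,
+ * and entries that hash to the same key are chained behind a single
+ * bucket slot.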
+ */
+int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params)
+{
+	int err;
+
+	err = rhashtable_init_noprof(&hlt->ht, params);
+	hlt->ht.rhlist = true;
+	return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init_noprof);
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+				void (*free_fn)(void *ptr, void *arg),
+				void *arg)
+{
+	struct rhlist_head *list;
+
+	if (!ht->rhlist) {
+		free_fn(rht_obj(ht, obj), arg);
+		return;
+	}
+
+	list = container_of(obj, struct rhlist_head, rhead);
+	do {
+		obj = &list->rhead;
+		list = rht_dereference(list->next, ht);
+		free_fn(rht_obj(ht, obj), arg);
+	} while (list);
+}
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht: the hash table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * Stops an eventual async resize. If defined, invokes free_fn for each
+ * element to release resources. Please note that RCU protected
+ * readers may still be accessing the elements. Releasing of resources
+ * must occur in a compatible manner. Then frees the bucket array.
+ *
+ * This function will eventually sleep to wait for an async resize
+ * to complete. The caller is responsible for ensuring that no further
+ * write operations occur in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+				 void (*free_fn)(void *ptr, void *arg),
+				 void *arg)
+{
+	struct bucket_table *tbl, *next_tbl;
+	unsigned int i;
+
+	cancel_work_sync(&ht->run_work);
+
+	mutex_lock(&ht->mutex);
+	tbl = rht_dereference(ht->tbl, ht);
+restart:
+	if (free_fn) {
+		for (i = 0; i < tbl->size; i++) {
+			struct rhash_head *pos, *next;
+
+			cond_resched();
+			for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
+			     next = !rht_is_a_nulls(pos) ?
+					rht_dereference(pos->next, ht) : NULL;
+			     !rht_is_a_nulls(pos);
+			     pos = next,
+			     next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL) + rhashtable_free_one(ht, pos, free_fn, arg); + } + } + + next_tbl = rht_dereference(tbl->future_tbl, ht); + bucket_table_free(tbl); + if (next_tbl) { + tbl = next_tbl; + goto restart; + } + mutex_unlock(&ht->mutex); +} +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} +EXPORT_SYMBOL_GPL(rhashtable_destroy); + +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, + tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return NULL; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(__rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + static struct rhash_lock_head __rcu *rhnull; + + if (!rhnull) + INIT_RHT_NULLS_HEAD(rhnull); + return __rht_bucket_nested(tbl, hash) ?: &rhnull; +} +EXPORT_SYMBOL_GPL(rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + hash >>= tbl->nest; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c new file mode 100644 index 00000000..150edd99 --- /dev/null +++ b/test/unit_homa_grant.c @@ -0,0 +1,1930 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +#include "homa_impl.h" +#include "homa_grant.h" +#include "homa_pacer.h" +#include "homa_peer.h" +#include "homa_pool.h" +#include "homa_rpc.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +char *rpc_ids(struct homa_rpc **rpcs, int count) +{ + static char buffer[1000]; + size_t length = 0; + int i; + + for (i = 0; i < count; i++) { + if (length != 0) + length += snprintf(buffer + length, + sizeof(buffer) - length, " "); + length += snprintf(buffer + length, sizeof(buffer) - length, + "%lld", rpcs[i]->id); + } + return buffer; +} + +static int hook_spinlock_count; +static void grant_spinlock_hook(char *id) +{ + if (strcmp(id, "spin_lock") != 0) + return; + mock_clock = 1000; + hook_spinlock_count++; +} + +static struct homa_grant *hook_grant; +static void grant_check_stalled_hook(char *id) +{ + if (strcmp(id, "grant_check_needy") != 0) + return; + atomic_dec(&hook_grant->stalled_rank); +} + +static struct homa_rpc *hook_end_rpc; +static int 
hook_end_lock_count; +static void grant_spinlock_end_hook(char *id) +{ + if (strcmp(id, "spin_lock") != 0) + return; + if (hook_end_lock_count > 0) { + hook_end_lock_count--; + if (hook_end_lock_count == 0) + homa_rpc_end(hook_end_rpc); + } +} + +FIXTURE(homa_grant) { + struct in6_addr client_ip[5]; + int client_port; + struct in6_addr server_ip[5]; + int server_port; + u64 client_id; + u64 server_id; + union sockaddr_in_union server_addr; + struct homa homa; + struct homa_net *hnet; + struct homa_sock hsk; + struct homa_data_hdr data; + int incoming_delta; + struct homa_grant_candidates cand; +}; +FIXTURE_SETUP(homa_grant) +{ + self->client_ip[0] = unit_get_in_addr("196.168.0.1"); + self->client_ip[1] = unit_get_in_addr("197.168.0.1"); + self->client_ip[2] = unit_get_in_addr("198.168.0.1"); + self->client_ip[3] = unit_get_in_addr("199.168.0.1"); + self->client_ip[4] = unit_get_in_addr("200.168.0.1"); + self->client_port = 40000; + self->server_ip[0] = unit_get_in_addr("1.2.3.4"); + self->server_ip[1] = unit_get_in_addr("2.2.3.4"); + self->server_ip[2] = unit_get_in_addr("3.2.3.4"); + self->server_ip[3] = unit_get_in_addr("4.2.3.4"); + self->server_ip[4] = unit_get_in_addr("5.2.3.4"); + self->server_port = 99; + self->client_id = 1234; + self->server_id = 1235; + homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); + self->homa.num_priorities = 1; + self->homa.poll_cycles = 0; + self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; + self->homa.qshared->fifo_fraction = 0; + self->homa.grant->fifo_fraction = 0; + self->homa.grant->window = 10000; + self->homa.grant->max_incoming = 50000; + self->homa.grant->max_rpcs_per_peer = 10; + mock_sock_init(&self->hsk, self->hnet, 0); + self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; + self->server_addr.in6.sin6_addr = self->server_ip[0]; + self->server_addr.in6.sin6_port = htons(self->server_port); + memset(&self->data, 0, sizeof(self->data)); + self->data.common = (struct homa_common_hdr){ + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = DATA, + .sender_id = cpu_to_be64(self->client_id) + }; + self->data.message_length = htonl(10000); + self->data.incoming = htonl(10000); + unit_log_clear(); + self->incoming_delta = 0; + homa_grant_cand_init(&self->cand); +} +FIXTURE_TEARDOWN(homa_grant) +{ + homa_grant_cand_check(&self->cand, self->homa.grant); + homa_destroy(&self->homa); + unit_teardown(); +} + +/* Create a client RPC whose msgin is mostly initialized, except + * homa_grant_init_rpc isn't invoked. + */ +static struct homa_rpc *test_rpc(FIXTURE_DATA(homa_grant) *self, + u64 id, struct in6_addr *server_ip, int size) +{ + struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, server_ip, self->server_port, + id, 1000, size); + + rpc->msgin.length = size; + skb_queue_head_init(&rpc->msgin.packets); + INIT_LIST_HEAD(&rpc->msgin.gaps); + rpc->msgin.bytes_remaining = size; + rpc->msgin.rank = -1; + rpc->msgin.granted = 1000; + return rpc; +} + +/* Create a client RPC whose msgin is properly initialized with no + * unscheduled bytes and no packets received. 
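+ * (Unlike test_rpc() above, this helper calls homa_message_in_init(),
+ * so homa_grant_init_rpc is invoked as part of setup.)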
+ */
+static struct homa_rpc *test_rpc_init(FIXTURE_DATA(homa_grant) *self,
+				      u64 id, struct in6_addr *server_ip, int size)
+{
+	struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, server_ip, self->server_port,
+			id, 1000, size);
+
+	homa_message_in_init(rpc, size, 0);
+	return rpc;
+}
+
+TEST_F(homa_grant, homa_grant_alloc__success)
+{
+	struct homa_grant *grant;
+
+	grant = homa_grant_alloc(&self->homa);
+	EXPECT_EQ(50, grant->fifo_fraction);
+	homa_grant_free(grant);
+}
+TEST_F(homa_grant, homa_grant_alloc__cant_allocate_memory)
+{
+	struct homa_grant *grant;
+
+	mock_kmalloc_errors = 1;
+	grant = homa_grant_alloc(&self->homa);
+	EXPECT_TRUE(IS_ERR(grant));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(grant));
+}
+TEST_F(homa_grant, homa_grant_alloc__cant_register_sysctls)
+{
+	struct homa_grant *grant;
+
+	mock_register_sysctl_errors = 1;
+	grant = homa_grant_alloc(&self->homa);
+	EXPECT_TRUE(IS_ERR(grant));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(grant));
+}
+
+TEST_F(homa_grant, homa_grant_free__basics)
+{
+	struct homa_grant *grant;
+
+	grant = homa_grant_alloc(&self->homa);
+	homa_grant_free(grant);
+	EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_free__sysctls_not_registered)
+{
+	struct homa_grant *grant;
+
+	grant = homa_grant_alloc(&self->homa);
+	grant->sysctl_header = NULL;
+	homa_grant_free(grant);
+	EXPECT_STREQ("", unit_log_get());
+}
+
+TEST_F(homa_grant, homa_grant_init_rpc__grants_not_needed)
+{
+	struct homa_rpc *rpc;
+
+	rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+			      self->server_ip, self->server_port, 100, 1000,
+			      20000);
+	homa_message_in_init(rpc, 2000, 2000);
+	EXPECT_EQ(-1, rpc->msgin.rank);
+	EXPECT_EQ(2000, rpc->msgin.granted);
+}
+TEST_F(homa_grant, homa_grant_init_rpc__grants_needed)
+{
+	struct homa_rpc *rpc;
+
+	rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+			      self->server_ip, self->server_port, 100, 1000,
+			      20000);
+
+	homa_message_in_init(rpc, 5000, 2000);
+	EXPECT_EQ(0, rpc->msgin.rank);
+	EXPECT_EQ(2000, rpc->msgin.granted);
+}
+TEST_F(homa_grant, homa_grant_init_rpc__no_bpages_available)
+{
+	struct homa_rpc *rpc;
+
+	rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+			      self->server_ip, self->server_port, 100, 1000,
+			      20000);
+
+	atomic_set(&self->hsk.buffer_pool->free_bpages, 0);
+	homa_message_in_init(rpc, 20000, 10000);
+	EXPECT_EQ(0, rpc->msgin.num_bpages);
+	EXPECT_EQ(-1, rpc->msgin.rank);
+	EXPECT_EQ(10000, rpc->msgin.granted);
+}
+
+TEST_F(homa_grant, homa_grant_end_rpc__basics)
+{
+	struct homa_rpc *rpc;
+
+	rpc = test_rpc_init(self, 100, self->server_ip, 20000);
+	rpc->msgin.rec_incoming = 100;
+	EXPECT_EQ(0, rpc->msgin.rank);
+
+	unit_hook_register(grant_spinlock_hook);
+	hook_spinlock_count = 0;
+
+	homa_grant_end_rpc(rpc);
+	EXPECT_EQ(-1, rpc->msgin.rank);
+	EXPECT_EQ(1, hook_spinlock_count);
+	EXPECT_EQ(-100, atomic_read(&self->homa.grant->total_incoming));
+	EXPECT_EQ(0, rpc->msgin.rec_incoming);
+}
+TEST_F(homa_grant, homa_grant_end_rpc__skip_cleanup_if_fully_granted)
+{
+	struct homa_rpc *rpc;
+
+	rpc = test_rpc_init(self, 100, self->server_ip, 20000);
+	rpc->msgin.rec_incoming = 100;
+	rpc->msgin.granted = rpc->msgin.length;
+	EXPECT_EQ(0, rpc->msgin.rank);
+
+	homa_grant_end_rpc(rpc);
+	EXPECT_EQ(0, rpc->msgin.rank);
+	EXPECT_EQ(-100, atomic_read(&self->homa.grant->total_incoming));
+	EXPECT_EQ(0, rpc->msgin.rec_incoming);
+}
+TEST_F(homa_grant, homa_grant_end_rpc__activate_other_rpc)
+{
+	struct homa_rpc *rpc1, *rpc2;
+
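+	/* Allow only one active RPC per peer, so that ending rpc1 must
+	 * promote rpc2 into the active set and issue it a grant.
+	 */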
self->homa.grant->max_rpcs_per_peer = 1; + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(-1, rpc2->msgin.rank); + + unit_hook_register(grant_spinlock_hook); + hook_spinlock_count = 0; + + unit_log_clear(); + homa_rpc_lock(rpc1); + homa_grant_end_rpc(rpc1); + homa_rpc_unlock(rpc1); + EXPECT_EQ(-1, rpc1->msgin.rank); + EXPECT_EQ(0, rpc2->msgin.rank); + EXPECT_EQ(4, hook_spinlock_count); + EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); +} + +TEST_F(homa_grant, homa_grant_window) +{ + /* Static grant window. */ + self->homa.grant->window_param = 5000; + EXPECT_EQ(5000, homa_grant_window(self->homa.grant)); + + /* Dynamic grant window. */ + self->homa.grant->window_param = 0; + self->homa.grant->max_incoming = 100000; + self->homa.grant->num_active_rpcs = 4; + EXPECT_EQ(20000, homa_grant_window(self->homa.grant)); +} + +TEST_F(homa_grant, homa_grant_outranks) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 20000); + rpc1->msgin.birth = 3000; + rpc2 = test_rpc(self, 102, self->server_ip, 30000); + rpc2->msgin.birth = 2000; + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc3->msgin.birth = 1999; + rpc4 = test_rpc(self, 106, self->server_ip, 30000); + rpc4->msgin.birth = 2000; + + EXPECT_EQ(1, homa_grant_outranks(rpc1, rpc2)); + EXPECT_EQ(0, homa_grant_outranks(rpc2, rpc1)); + EXPECT_EQ(0, homa_grant_outranks(rpc2, rpc3)); + EXPECT_EQ(1, homa_grant_outranks(rpc3, rpc2)); + EXPECT_EQ(0, homa_grant_outranks(rpc2, rpc4)); + EXPECT_EQ(0, homa_grant_outranks(rpc4, rpc2)); +} + +TEST_F(homa_grant, homa_grant_priority__no_extra_levels) +{ + self->homa.max_sched_prio = 6; + self->homa.grant->num_active_rpcs = 7; + EXPECT_EQ(6, homa_grant_priority(&self->homa, 0)); + EXPECT_EQ(0, homa_grant_priority(&self->homa, 7)); +} +TEST_F(homa_grant, homa_grant_priority__extra_levels) +{ + self->homa.max_sched_prio = 6; + self->homa.grant->num_active_rpcs = 4; + EXPECT_EQ(3, homa_grant_priority(&self->homa, 0)); + EXPECT_EQ(0, homa_grant_priority(&self->homa, 7)); +} + +TEST_F(homa_grant, homa_grant_insert_active__basics) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc(self, 100, self->server_ip, 30000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(0, rpc1->msgin.rank); + + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(1, rpc1->msgin.rank); + EXPECT_EQ(0, rpc2->msgin.rank); + + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(1, rpc1->msgin.rank); + EXPECT_EQ(0, rpc2->msgin.rank); + EXPECT_EQ(2, rpc3->msgin.rank); + EXPECT_EQ(3, rpc1->peer->active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__too_many_from_same_peer) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, &self->server_ip[1], 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 25000); + + self->homa.grant->max_rpcs_per_peer = 2; + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(rpc4, homa_grant_insert_active(rpc4)); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(1, rpc2->msgin.rank); + EXPECT_EQ(2, rpc3->msgin.rank); + EXPECT_EQ(-1, rpc4->msgin.rank); + 
EXPECT_EQ(2, rpc1->peer->active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__bump_rpc_from_same_peer) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, &self->server_ip[1], 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 5000); + + self->homa.grant->max_rpcs_per_peer = 2; + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(rpc3, homa_grant_insert_active(rpc4)); + EXPECT_EQ(1, rpc1->msgin.rank); + EXPECT_EQ(2, rpc2->msgin.rank); + EXPECT_EQ(-1, rpc3->msgin.rank); + EXPECT_EQ(0, rpc4->msgin.rank); + EXPECT_EQ(2, rpc1->peer->active_rpcs); + EXPECT_EQ(3, self->homa.grant->num_active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__no_room_for_new_rpc) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 30000); + + self->homa.grant->max_overcommit = 3; + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(rpc4, homa_grant_insert_active(rpc4)); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(1, rpc2->msgin.rank); + EXPECT_EQ(2, rpc3->msgin.rank); + EXPECT_EQ(-1, rpc4->msgin.rank); + EXPECT_EQ(3, self->homa.grant->num_active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__insert_in_middle_and_bump) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 15000); + + self->homa.grant->max_overcommit = 3; + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(rpc3, homa_grant_insert_active(rpc4)); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(2, rpc2->msgin.rank); + EXPECT_EQ(-1, rpc3->msgin.rank); + EXPECT_EQ(1, rpc4->msgin.rank); + EXPECT_EQ(3, self->homa.grant->num_active_rpcs); + EXPECT_EQ(rpc4, self->homa.grant->active_rpcs[1]); + EXPECT_EQ(rpc2, self->homa.grant->active_rpcs[2]); + EXPECT_EQ(3, rpc1->peer->active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__insert_in_middle_no_bump) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 15000); + + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc4)); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(2, rpc2->msgin.rank); + EXPECT_EQ(3, rpc3->msgin.rank); + EXPECT_EQ(1, rpc4->msgin.rank); + EXPECT_EQ(4, self->homa.grant->num_active_rpcs); + EXPECT_EQ(rpc4, self->homa.grant->active_rpcs[1]); + EXPECT_EQ(rpc2, self->homa.grant->active_rpcs[2]); + EXPECT_EQ(rpc3, self->homa.grant->active_rpcs[3]); + EXPECT_EQ(4, rpc1->peer->active_rpcs); +} + +TEST_F(homa_grant, 
homa_grant_adjust_peer__remove_peer_from_grantable_peers)
+{
+	struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 100000);
+	struct homa_peer *peer = rpc->peer;
+
+	list_add_tail(&peer->grantable_links,
+		      &self->homa.grant->grantable_peers);
+	EXPECT_EQ(1, list_empty(&peer->grantable_rpcs));
+	EXPECT_EQ(0, list_empty(&peer->grantable_links));
+	EXPECT_EQ(0, list_empty(&self->homa.grant->grantable_peers));
+
+	homa_grant_adjust_peer(self->homa.grant, peer);
+	EXPECT_EQ(1, list_empty(&peer->grantable_links));
+	EXPECT_EQ(1, list_empty(&self->homa.grant->grantable_peers));
+}
+TEST_F(homa_grant, homa_grant_adjust_peer__insert_in_grantable_peers)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 70000);
+
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1,
+					     100000));
+	homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip + 2,
+					     50000));
+	list_add_tail(&rpc->grantable_links, &rpc->peer->grantable_rpcs);
+	homa_grant_adjust_peer(self->homa.grant, rpc->peer);
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 3.2.3.4: id 300 ungranted 49000; "
+		     "peer 1.2.3.4: id 100 ungranted 69000; "
+		     "peer 2.2.3.4: id 200 ungranted 99000",
+		     unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_adjust_peer__append_to_grantable_peers)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 120000);
+
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1,
+					     100000));
+	homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip + 2,
+					     50000));
+	list_add_tail(&rpc->grantable_links, &rpc->peer->grantable_rpcs);
+	homa_grant_adjust_peer(self->homa.grant, rpc->peer);
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 3.2.3.4: id 300 ungranted 49000; "
+		     "peer 2.2.3.4: id 200 ungranted 99000; "
+		     "peer 1.2.3.4: id 100 ungranted 119000",
+		     unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_adjust_peer__move_peer_upwards)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 120000);
+
+	homa_grant_insert_grantable(rpc);
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1,
+					     100000));
+	homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip + 2,
+					     50000));
+	homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip + 3,
+					     80000));
+	rpc->msgin.granted += 45000;
+	homa_grant_adjust_peer(self->homa.grant, rpc->peer);
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 3.2.3.4: id 300 ungranted 49000; "
+		     "peer 1.2.3.4: id 100 ungranted 74000; "
+		     "peer 4.2.3.4: id 400 ungranted 79000; "
+		     "peer 2.2.3.4: id 200 ungranted 99000",
+		     unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_adjust_peer__move_peer_to_front)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 100000);
+
+	homa_grant_insert_grantable(rpc);
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1,
+					     50000));
+	rpc->msgin.granted += 55000;
+	homa_grant_adjust_peer(self->homa.grant, rpc->peer);
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 1.2.3.4: id 100 ungranted 44000; "
+		     "peer 2.2.3.4: id 200 ungranted 49000",
+		     unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_adjust_peer__move_peer_downwards)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 40000);
+
+	homa_grant_insert_grantable(rpc);
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1,
+					     100000));
+	homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip + 2,
+					     50000));
+	homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip + 3,
+					     80000));
+	rpc->msgin.length += 41000;
+	homa_grant_adjust_peer(self->homa.grant, rpc->peer);
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 3.2.3.4: id 300 ungranted 49000; "
+		     "peer 4.2.3.4: id 400 ungranted 79000; "
+		     "peer 1.2.3.4: id 100 ungranted 80000; "
+		     "peer 2.2.3.4: id 200 ungranted 99000",
+		     unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_adjust_peer__move_peer_to_back)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 50000);
+
+	homa_grant_insert_grantable(rpc);
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1,
+					     100000));
+	rpc->msgin.length += 55000;
+	homa_grant_adjust_peer(self->homa.grant, rpc->peer);
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 2.2.3.4: id 200 ungranted 99000; "
+		     "peer 1.2.3.4: id 100 ungranted 104000",
+		     unit_log_get());
+}
+
+TEST_F(homa_grant, homa_grant_insert_grantable__insert_in_peer_list)
+{
+	homa_grant_insert_grantable(test_rpc(self, 100, self->server_ip,
+					     100000));
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip,
+					     50000));
+	homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip,
+					     1200000));
+	homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip,
+					     70000));
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 49000 "
+		     "id 400 ungranted 69000 "
+		     "id 100 ungranted 99000 "
+		     "id 300 ungranted 1199000",
+		     unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_insert_grantable__insert_peer_in_grantable_peers)
+{
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip,
+					     100000));
+	homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip+1,
+					     50000));
+	homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+2,
+					     120000));
+	homa_grant_insert_grantable(test_rpc(self, 500, self->server_ip+3,
+					     70000));
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 2.2.3.4: id 300 ungranted 49000; "
+		     "peer 4.2.3.4: id 500 ungranted 69000; "
+		     "peer 1.2.3.4: id 200 ungranted 99000; "
+		     "peer 3.2.3.4: id 400 ungranted 119000",
+		     unit_log_get());
+}
+
+TEST_F(homa_grant, homa_grant_manage_rpc__update_metrics)
+{
+	self->homa.grant->last_grantable_change = 50;
+	self->homa.grant->num_grantable_rpcs = 3;
+	mock_clock = 200;
+	homa_grant_manage_rpc(test_rpc(self, 100, self->server_ip, 100000));
+	EXPECT_EQ(4, self->homa.grant->num_grantable_rpcs);
+	EXPECT_EQ(450, homa_metrics_per_cpu()->grantable_rpcs_integral);
+	EXPECT_EQ(200, self->homa.grant->last_grantable_change);
+}
+TEST_F(homa_grant, homa_grant_manage_rpc__dont_change_max_grantable_rpcs)
+{
+	self->homa.grant->num_grantable_rpcs = 3;
+	self->homa.grant->max_grantable_rpcs = 5;
+	homa_grant_manage_rpc(test_rpc(self, 100, self->server_ip, 100000));
+	EXPECT_EQ(4, self->homa.grant->num_grantable_rpcs);
+	EXPECT_EQ(5, self->homa.grant->max_grantable_rpcs);
+}
+TEST_F(homa_grant, homa_grant_manage_rpc__insert_and_bump_to_grantables)
+{
+	struct homa_rpc *rpc1, *rpc2;
+
+	rpc1 = test_rpc(self, 100, self->server_ip, 50000);
+	rpc2 = test_rpc(self, 102, self->server_ip, 20000);
+
+	self->homa.grant->max_overcommit = 1;
+	self->homa.grant->last_grantable_change = 50;
+	self->homa.grant->num_grantable_rpcs = 3;
+	mock_clock = 200;
+	homa_grant_manage_rpc(rpc1);
+	mock_clock = 300;
+	homa_grant_manage_rpc(rpc2);
+	EXPECT_EQ(5, self->homa.grant->max_grantable_rpcs);
+	EXPECT_EQ(850, homa_metrics_per_cpu()->grantable_rpcs_integral);
+	EXPECT_EQ(300, self->homa.grant->last_grantable_change);
+	EXPECT_EQ(-1, rpc1->msgin.rank);
+	EXPECT_EQ(0, rpc2->msgin.rank);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 102 ungranted 19000; "
+		     "peer 1.2.3.4: id 100 ungranted 49000", unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_manage_rpc__set_window)
+{
+	struct homa_rpc *rpc1;
+
+	rpc1 = test_rpc(self, 100, self->server_ip, 50000);
+
+	self->homa.grant->max_incoming = 100000;
+	self->homa.grant->window_param = 0;
+	homa_grant_manage_rpc(rpc1);
+	EXPECT_EQ(50000, self->homa.grant->window);
+}
+
+TEST_F(homa_grant, homa_grant_remove_grantable__not_first_in_peer_list)
+{
+	struct homa_rpc *rpc = test_rpc(self, 300, self->server_ip, 30000);
+
+	homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip,
+					     20000));
+	homa_grant_insert_grantable(rpc);
+	homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+1,
+					     25000));
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 19000 "
+		     "id 300 ungranted 29000; "
+		     "peer 2.2.3.4: id 400 ungranted 24000",
+		     unit_log_get());
+
+	homa_grant_remove_grantable(rpc);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 19000; "
+		     "peer 2.2.3.4: id 400 ungranted 24000",
+		     unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_remove_grantable__remove_peer_from_grantable_peers)
+{
+	struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 30000);
+
+	homa_grant_insert_grantable(rpc);
+	homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip+1,
+					     40000));
+	homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+2,
+					     20000));
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 3.2.3.4: id 400 ungranted 19000; "
+		     "peer 1.2.3.4: id 200 ungranted 29000; "
+		     "peer 2.2.3.4: id 300 ungranted 39000",
+		     unit_log_get());
+
+	homa_grant_remove_grantable(rpc);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("peer 3.2.3.4: id 400 ungranted 19000; "
+		     "peer 2.2.3.4: id 300 ungranted 39000",
+		     unit_log_get());
+}
+
+TEST_F(homa_grant, homa_grant_remove_active__copy_existing_rpcs)
+{
+	struct homa_rpc *rpc;
+
+	homa_grant_manage_rpc(test_rpc(self, 200, self->server_ip,
+				       50000));
+	homa_grant_manage_rpc(test_rpc(self, 300, self->server_ip,
+				       40000));
+	homa_grant_manage_rpc(test_rpc(self, 400, self->server_ip,
+				       30000));
+	homa_grant_manage_rpc(test_rpc(self, 500, self->server_ip,
+				       20000));
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 500 ungranted 19000; "
+		     "active[1]: id 400 ungranted 29000; "
+		     "active[2]: id 300 ungranted 39000; "
+		     "active[3]: id 200 ungranted 49000",
+		     unit_log_get());
+
+	rpc = self->homa.grant->active_rpcs[0];
+	EXPECT_EQ(4, rpc->peer->active_rpcs);
+
+	homa_grant_remove_active(rpc, &self->cand);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 400 ungranted 29000; "
+		     "active[1]: id 300 ungranted 39000; "
+		     "active[2]: id 200 ungranted 49000",
+		     unit_log_get());
+	EXPECT_EQ(-1, rpc->msgin.rank);
+	EXPECT_EQ(3, rpc->peer->active_rpcs);
+	EXPECT_TRUE(homa_grant_cand_empty(&self->cand));
+}
+TEST_F(homa_grant, homa_grant_remove_active__promote_from_grantable)
+{
+	struct homa_rpc *rpc;
+
+	self->homa.grant->max_overcommit = 2;
+	homa_grant_manage_rpc(test_rpc(self, 200, self->server_ip,
+				       50000));
+	homa_grant_manage_rpc(test_rpc(self, 300, self->server_ip,
+				       40000));
+	homa_grant_manage_rpc(test_rpc(self, 400, self->server_ip,
+				       30000));
+	homa_grant_manage_rpc(test_rpc(self, 500, self->server_ip,
+				       20000));
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 500 ungranted 19000; "
+		     "active[1]: id 400 ungranted 29000; "
+		     "peer 1.2.3.4: id 300 ungranted 39000 "
+		     "id 200 ungranted 49000",
+		     unit_log_get());
+
+	rpc = self->homa.grant->active_rpcs[1];
+	EXPECT_EQ(2, rpc->peer->active_rpcs);
+
+	homa_grant_remove_active(rpc, &self->cand);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 500 ungranted 19000; "
+		     "active[1]: id 300 ungranted 39000; "
+		     "peer 1.2.3.4: id 200 ungranted 49000",
+		     unit_log_get());
+	EXPECT_EQ(-1, rpc->msgin.rank);
+	EXPECT_EQ(2, rpc->peer->active_rpcs);
+	EXPECT_FALSE(homa_grant_cand_empty(&self->cand));
+}
+TEST_F(homa_grant, homa_grant_remove_active__skip_overactive_peer)
+{
+	struct homa_rpc *rpc;
+
+	self->homa.grant->max_overcommit = 2;
+	self->homa.grant->max_rpcs_per_peer = 1;
+	homa_grant_manage_rpc(test_rpc(self, 200, self->server_ip+1,
+				       50000));
+	homa_grant_manage_rpc(test_rpc(self, 300, self->server_ip+1,
+				       40000));
+	homa_grant_manage_rpc(test_rpc(self, 400, self->server_ip,
+				       30000));
+	homa_grant_manage_rpc(test_rpc(self, 500, self->server_ip,
+				       20000));
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 500 ungranted 19000; "
+		     "active[1]: id 300 ungranted 39000; "
+		     "peer 1.2.3.4: id 400 ungranted 29000; "
+		     "peer 2.2.3.4: id 200 ungranted 49000",
+		     unit_log_get());
+
+	rpc = self->homa.grant->active_rpcs[1];
+
+	homa_grant_remove_active(rpc, &self->cand);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 500 ungranted 19000; "
+		     "active[1]: id 200 ungranted 49000; "
+		     "peer 1.2.3.4: id 400 ungranted 29000",
+		     unit_log_get());
+	EXPECT_FALSE(homa_grant_cand_empty(&self->cand));
+}
+
+TEST_F(homa_grant, homa_grant_unmanage_rpc__basics)
+{
+	struct homa_rpc *rpc;
+
+	self->homa.grant->max_rpcs_per_peer = 1;
+	self->homa.grant->window_param = 0;
+	self->homa.grant->max_incoming = 60000;
+	homa_grant_manage_rpc(test_rpc(self, 100, self->server_ip,
+				       20000));
+	rpc = test_rpc(self, 200, self->server_ip, 30000);
+	homa_grant_manage_rpc(rpc);
+
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 100 ungranted 19000; "
+		     "peer 1.2.3.4: id 200 ungranted 29000",
+		     unit_log_get());
+	EXPECT_EQ(2, self->homa.grant->num_grantable_rpcs);
+	EXPECT_EQ(30000, self->homa.grant->window);
+
+	self->homa.grant->last_grantable_change = 100;
+	mock_clock = 250;
+
+	homa_grant_unmanage_rpc(rpc, &self->cand);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 100 ungranted 19000", unit_log_get());
+	EXPECT_EQ(1, self->homa.grant->num_grantable_rpcs);
+	EXPECT_EQ(300, homa_metrics_per_cpu()->grantable_rpcs_integral);
+	EXPECT_EQ(250, self->homa.grant->last_grantable_change);
+	EXPECT_EQ(30000, self->homa.grant->window);
+
+	homa_grant_unmanage_rpc(self->homa.grant->active_rpcs[0], &self->cand);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("", unit_log_get());
+	EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs);
+	EXPECT_EQ(60000, self->homa.grant->window);
+}
+TEST_F(homa_grant, homa_grant_unmanage_rpc__rpc_not_managed)
+{
+	struct homa_rpc *rpc;
+
+	self->homa.grant->max_rpcs_per_peer = 1;
+	self->homa.grant->window_param = 0;
+	self->homa.grant->max_incoming = 60000;
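+	/* The RPC below is never passed to homa_grant_manage_rpc, so
+	 * the unmanage call must leave these baseline values untouched.
+	 */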
+	self->homa.grant->last_grantable_change = 100;
+	mock_clock = 250;
+	rpc = test_rpc(self, 200, self->server_ip, 30000);
+	EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs);
+
+	homa_grant_unmanage_rpc(rpc, &self->cand);
+	EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->grantable_rpcs_integral);
+	EXPECT_EQ(100, self->homa.grant->last_grantable_change);
+}
+TEST_F(homa_grant, homa_grant_unmanage_rpc__remove_from_oldest_rpc)
+{
+	struct homa_rpc *rpc;
+
+	rpc = test_rpc(self, 200, self->server_ip, 30000);
+	homa_grant_manage_rpc(rpc);
+	self->homa.grant->oldest_rpc = rpc;
+	homa_rpc_hold(rpc);
+	EXPECT_EQ(2, refcount_read(&rpc->refs));
+
+	homa_grant_unmanage_rpc(rpc, &self->cand);
+	EXPECT_EQ(NULL, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(1, refcount_read(&rpc->refs));
+}
+
+TEST_F(homa_grant, homa_grant_update_incoming)
+{
+	struct homa_rpc *rpc;
+
+	rpc = test_rpc(self, 200, self->server_ip, 20000);
+
+	/* Case 1: total_incoming increases. */
+	atomic_set(&self->homa.grant->total_incoming, 1000);
+	rpc->msgin.bytes_remaining = 19000;
+	rpc->msgin.granted = 3000;
+	rpc->msgin.rec_incoming = 500;
+	homa_grant_update_incoming(rpc, self->homa.grant);
+	EXPECT_EQ(2500, atomic_read(&self->homa.grant->total_incoming));
+	EXPECT_EQ(2000, rpc->msgin.rec_incoming);
+
+	/* Case 2: incoming negative. */
+	atomic_set(&self->homa.grant->total_incoming, 1000);
+	rpc->msgin.bytes_remaining = 16000;
+	rpc->msgin.granted = 3000;
+	rpc->msgin.rec_incoming = 500;
+	homa_grant_update_incoming(rpc, self->homa.grant);
+	EXPECT_EQ(500, atomic_read(&self->homa.grant->total_incoming));
+	EXPECT_EQ(0, rpc->msgin.rec_incoming);
+
+	/* Case 3: no change to rec_incoming. */
+	atomic_set(&self->homa.grant->total_incoming, 1000);
+	self->homa.grant->max_incoming = 1000;
+	rpc->msgin.bytes_remaining = 16000;
+	rpc->msgin.granted = 4500;
+	rpc->msgin.rec_incoming = 500;
+	homa_grant_update_incoming(rpc, self->homa.grant);
+	EXPECT_EQ(1000, atomic_read(&self->homa.grant->total_incoming));
+	EXPECT_EQ(500, rpc->msgin.rec_incoming);
+}
+
+TEST_F(homa_grant, homa_grant_update_granted__basics)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	rpc->msgin.rank = 1;
+	self->homa.grant->num_active_rpcs = 4;
+	EXPECT_EQ(2, homa_grant_update_granted(rpc, self->homa.grant));
+	self->homa.grant->num_active_rpcs = 0;
+	EXPECT_EQ(10000, rpc->msgin.granted);
+	EXPECT_EQ(INT_MAX, atomic_read(&self->homa.grant->stalled_rank));
+}
+TEST_F(homa_grant, homa_grant_update_granted__rpc_idle)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	rpc->silent_ticks = 2;
+	EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant));
+	EXPECT_EQ(1000, rpc->msgin.granted);
+}
+TEST_F(homa_grant, homa_grant_update_granted__not_active)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	rpc->msgin.rank = -1;
+	EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant));
+	EXPECT_EQ(1000, rpc->msgin.granted);
+}
+TEST_F(homa_grant, homa_grant_update_granted__already_fully_granted)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	rpc->msgin.rank = 2;
+	rpc->msgin.granted = rpc->msgin.length;
+	EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant));
+}
+TEST_F(homa_grant, homa_grant_update_granted__end_of_message)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	/* First call grants remaining bytes in message. */
+	rpc->msgin.bytes_remaining = 5000;
+	rpc->msgin.rank = 2;
+	EXPECT_EQ(0, homa_grant_update_granted(rpc, self->homa.grant));
+	EXPECT_EQ(20000, rpc->msgin.granted);
+
+	/* Second call cannot grant anything additional. */
+	EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant));
+}
+TEST_F(homa_grant, homa_grant_update_granted__insufficient_room_in_incoming)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	rpc->msgin.bytes_remaining = 5000;
+	rpc->msgin.rank = 5;
+	atomic_set(&self->homa.grant->total_incoming, 48000);
+	EXPECT_EQ(0, homa_grant_update_granted(rpc, self->homa.grant));
+	EXPECT_EQ(17000, rpc->msgin.granted);
+}
+TEST_F(homa_grant, homa_grant_update_granted__incoming_overcommitted)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	atomic_set(&self->homa.grant->total_incoming, 51000);
+	atomic_set(&self->homa.grant->stalled_rank, 3);
+	rpc->msgin.rank = 2;
+	EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant));
+	EXPECT_EQ(1000, rpc->msgin.granted);
+	EXPECT_EQ(2, atomic_read(&self->homa.grant->stalled_rank));
+}
+TEST_F(homa_grant, homa_grant_update_granted__incoming_overcommitted_but_stalled_doesnt_change)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	atomic_set(&self->homa.grant->total_incoming, 51000);
+	atomic_set(&self->homa.grant->stalled_rank, 3);
+	rpc->msgin.rank = 4;
+	EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant));
+	EXPECT_EQ(1000, rpc->msgin.granted);
+	EXPECT_EQ(3, atomic_read(&self->homa.grant->stalled_rank));
+}
+
+TEST_F(homa_grant, homa_grant_send__basics)
+{
+	struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000);
+
+	mock_xmit_log_verbose = 1;
+	rpc->msgin.granted = 2600;
+	unit_log_clear();
+	homa_grant_send(rpc, 3);
+	EXPECT_SUBSTR("id 100, offset 2600, grant_prio 3", unit_log_get());
+}
+
+TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized)
+{
+	struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			100, 1000, 2000);
+
+	rpc->msgin.bytes_remaining = 500;
+	rpc->msgin.granted = 1000;
+	rpc->msgin.rec_incoming = 0;
+	unit_log_clear();
+	homa_grant_check_rpc(rpc);
+	EXPECT_STREQ("", unit_log_get());
+	EXPECT_EQ(0, rpc->msgin.rec_incoming);
+	EXPECT_EQ(0, atomic_read(&self->homa.grant->total_incoming));
+	EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls);
+}
+TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead)
+{
+	struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			100, 1000, 20000);
+
+	homa_message_in_init(rpc, 20000, 0);
+	EXPECT_EQ(0, rpc->msgin.granted);
+	rpc->state = RPC_DEAD;
+
+	unit_log_clear();
+	homa_rpc_lock(rpc);
+	homa_grant_check_rpc(rpc);
+	homa_rpc_unlock(rpc);
+	EXPECT_STREQ("", unit_log_get());
+	EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked);
+	EXPECT_EQ(0, rpc->msgin.granted);
+	rpc->state = RPC_INCOMING;
+}
+TEST_F(homa_grant, homa_grant_check_rpc__update_incoming_even_if_rpc_no_longer_active)
+{
+	struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			100, 1000, 2000);
+
+	homa_message_in_init(rpc, 2000, 0);
+	EXPECT_EQ(0, rpc->msgin.rank);
+	rpc->msgin.rank = -1;
+	rpc->msgin.rec_incoming = 100;
+	atomic_set(&self->homa.grant->total_incoming, 1000);
+	unit_log_clear();
+	homa_grant_check_rpc(rpc);
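+	/* No grant is sent, but the RPC's recorded incoming bytes must
+	 * still be released from total_incoming.
+	 */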
EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls); + EXPECT_EQ(900, atomic_read(&self->homa.grant->total_incoming)); + EXPECT_EQ(0, rpc->msgin.rec_incoming); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); +} +TEST_F(homa_grant, homa_grant_check_rpc__fix_order) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + EXPECT_EQ(2, rpc3->msgin.rank); + rpc3->msgin.granted = 25000; + rpc3->msgin.bytes_remaining = 15000; + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + mock_clock = self->homa.grant->next_recalc; + + unit_log_clear(); + homa_rpc_lock(rpc2); + homa_grant_check_rpc(rpc2); + homa_rpc_unlock(rpc2); + EXPECT_STREQ("xmit GRANT 35000@2; xmit GRANT 5000@1", unit_log_get()); + EXPECT_EQ(5000, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(35000, rpc3->msgin.granted); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 104 ungranted 5000; " + "active[1]: id 100 ungranted 15000; " + "active[2]: id 102 ungranted 30000", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_recalcs); + EXPECT_EQ(40000, self->homa.grant->next_recalc); +} +TEST_F(homa_grant, homa_grant_check_rpc__fast_path) +{ + struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->server_port, + 100, 1000, 20000); + + homa_message_in_init(rpc, 20000, 0); + EXPECT_EQ(0, rpc->msgin.granted); + + /* First call issues a grant. */ + unit_log_clear(); + homa_rpc_lock(rpc); + homa_grant_check_rpc(rpc); + EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_recalcs); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_others); + EXPECT_EQ(10000, rpc->msgin.granted); + + /* Second call doesn't issue a grant (nothing has changed). 
*/ + unit_log_clear(); + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(2, homa_metrics_per_cpu()->grant_check_calls); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(10000, rpc->msgin.granted); +} +TEST_F(homa_grant, homa_grant_check_rpc__skip_fast_path_rpc_not_active) +{ + struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->server_port, + 100, 1000, 20000); + + homa_message_in_init(rpc, 20000, 0); + EXPECT_EQ(0, rpc->msgin.rank); + rpc->msgin.rank = -1; + + unit_log_clear(); + homa_rpc_lock(rpc); + + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, rpc->msgin.granted); +} +TEST_F(homa_grant, homa_grant_check_rpc__skip_fast_path_because_of_stalled_rpc) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + + unit_log_clear(); + atomic_set(&self->homa.grant->stalled_rank, 1); + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); + EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 5000@0", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(2, atomic_read(&self->homa.grant->stalled_rank)); + EXPECT_EQ(0, rpc1->msgin.granted); + EXPECT_EQ(10000, rpc2->msgin.granted); + EXPECT_EQ(5000, rpc3->msgin.granted); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_others); +} +TEST_F(homa_grant, homa_grant_check_rpc__fast_path_grants_to_end_of_message) +{ + struct homa_rpc *rpc = test_rpc_init(self, 100, self->server_ip, 6000); + + EXPECT_EQ(1, self->homa.grant->num_grantable_rpcs); + + unit_log_clear(); + homa_rpc_lock(rpc); + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_STREQ("xmit GRANT 6000@0", unit_log_get()); + EXPECT_EQ(6000, rpc->msgin.granted); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); +} +TEST_F(homa_grant, homa_grant_check_rpc__fast_path_promote_other_message) +{ + struct homa_rpc *rpc1, *rpc2; + + self->homa.grant->max_overcommit = 1; + rpc1 = test_rpc_init(self, 100, self->server_ip, 8000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 25000); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(-1, rpc2->msgin.rank); + + unit_log_clear(); + homa_rpc_lock(rpc1); + homa_grant_check_rpc(rpc1); + homa_rpc_unlock(rpc1); + EXPECT_STREQ("xmit GRANT 8000@0; xmit GRANT 10000@0", unit_log_get()); + EXPECT_EQ(8000, rpc1->msgin.granted); + EXPECT_EQ(10000, rpc2->msgin.granted); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 102 ungranted 15000", unit_log_get()); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); +} +TEST_F(homa_grant, homa_grant_check_rpc__fast_path_issue_fifo_grant) +{ + struct homa_rpc *rpc1, *rpc2; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 50000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 100000); + + self->homa.grant->fifo_grant_time = 0; + self->homa.grant->fifo_grant_interval = 10000; + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 50; + + unit_log_clear(); + homa_rpc_lock(rpc1); + homa_grant_check_rpc(rpc1); + EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 
20000@0", unit_log_get()); + EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_EQ(20000, rpc2->msgin.granted); + homa_rpc_unlock(rpc1); +} +TEST_F(homa_grant, homa_grant_check_rpc__dont_check_needy_if_incoming_maxed) +{ + struct homa_rpc *rpc; + + test_rpc_init(self, 100, self->server_ip, 20000); + test_rpc_init(self, 102, self->server_ip, 30000); + rpc = test_rpc_init(self, 104, self->server_ip, 40000); + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming); + + unit_log_clear(); + atomic_set(&self->homa.grant->stalled_rank, 0); + homa_rpc_lock(rpc); + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_others); +} +TEST_F(homa_grant, homa_grant_check_rpc__reread_stalled_rank_before_checking_needy) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 5000); + + unit_hook_register(grant_check_stalled_hook); + hook_grant = self->homa.grant; + + unit_log_clear(); + atomic_set(&self->homa.grant->stalled_rank, 1); + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_others); + EXPECT_EQ(0, atomic_read(&self->homa.grant->stalled_rank)); + EXPECT_EQ(5000, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(0, rpc3->msgin.granted); +} +TEST_F(homa_grant, homa_grant_check_rpc__skip_rpc_with_too_much_incoming) +{ + struct homa_rpc *rpc2, *rpc3; + + test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + rpc2->msgin.rec_incoming = 10000; + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + atomic_set(&self->homa.grant->stalled_rank, 0); + + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 100 ungranted 10000; " + "active[1]: id 102 ungranted 30000; " + "active[2]: id 104 ungranted 35000", unit_log_get()); +} +TEST_F(homa_grant, homa_grant_check_rpc__skip_dead_rpc_when_checking_needy) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + int saved_state; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + saved_state = rpc2->state; + rpc2->state = RPC_DEAD; + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + atomic_set(&self->homa.grant->stalled_rank, 0); + + unit_log_clear(); + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); + EXPECT_STREQ("xmit GRANT 10000@2; xmit GRANT 5000@0", unit_log_get()); + EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(5000, rpc3->msgin.granted); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 100 ungranted 10000; " + "active[1]: id 102 ungranted 30000; " + "active[2]: id 104 ungranted 35000", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_locked); + rpc2->state = saved_state; +} + +TEST_F(homa_grant, homa_grant_fix_order) 
+{
+	struct homa_rpc *rpc3, *rpc4;
+
+	test_rpc_init(self, 100, self->server_ip, 20000);
+	test_rpc_init(self, 102, self->server_ip, 30000);
+	rpc3 = test_rpc_init(self, 104, self->server_ip, 40000);
+	rpc4 = test_rpc_init(self, 106, self->server_ip, 50000);
+	rpc3->msgin.granted = 15000;
+	rpc3->msgin.bytes_remaining = 25000;
+	rpc4->msgin.granted = 26000;
+	rpc4->msgin.bytes_remaining = 24000;
+
+	EXPECT_EQ(1, homa_grant_fix_order(self->homa.grant));
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 100 ungranted 20000; "
+		     "active[1]: id 106 ungranted 24000; "
+		     "active[2]: id 104 ungranted 25000; "
+		     "active[3]: id 102 ungranted 30000", unit_log_get());
+	EXPECT_EQ(3, homa_metrics_per_cpu()->grant_priority_bumps);
+
+	/* Second call: nothing changes. */
+	EXPECT_EQ(INT_MAX, homa_grant_fix_order(self->homa.grant));
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_EQ(3, homa_metrics_per_cpu()->grant_priority_bumps);
+}
+
+TEST_F(homa_grant, homa_grant_find_oldest__check_grantable_lists)
+{
+	struct homa_rpc *rpc1, *rpc2, *rpc3;
+
+	rpc1 = test_rpc(self, 100, self->server_ip, 40000);
+	rpc1->msgin.birth = 100;
+	rpc2 = test_rpc(self, 102, self->server_ip, 20000);
+	rpc2->msgin.birth = 200;
+	rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000);
+	rpc3->msgin.birth = 300;
+	homa_grant_insert_grantable(rpc1);
+	homa_grant_insert_grantable(rpc2);
+	homa_grant_insert_grantable(rpc3);
+
+	homa_grant_find_oldest(self->homa.grant);
+	ASSERT_NE(NULL, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(100, self->homa.grant->oldest_rpc->id);
+}
+TEST_F(homa_grant, homa_grant_find_oldest__fifo_grant_unused)
+{
+	struct homa_rpc *rpc1, *rpc2, *rpc3;
+
+	rpc1 = test_rpc(self, 100, self->server_ip, 400000);
+	rpc1->msgin.birth = 100;
+	self->homa.grant->fifo_grant_increment = 10000;
+	rpc1->msgin.rec_incoming = 20000 + self->homa.grant->window;
+	rpc2 = test_rpc(self, 102, self->server_ip, 20000);
+	rpc2->msgin.birth = 200;
+	rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000);
+	rpc3->msgin.birth = 300;
+	homa_grant_insert_grantable(rpc1);
+	homa_grant_insert_grantable(rpc2);
+	homa_grant_insert_grantable(rpc3);
+
+	homa_grant_find_oldest(self->homa.grant);
+	ASSERT_NE(NULL, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(102, self->homa.grant->oldest_rpc->id);
+}
+TEST_F(homa_grant, homa_grant_find_oldest__check_active_rpcs)
+{
+	struct homa_rpc *rpc1, *rpc2, *rpc3;
+
+	rpc1 = test_rpc_init(self, 100, self->server_ip, 40000);
+	rpc1->msgin.birth = 100;
+	rpc2 = test_rpc_init(self, 102, self->server_ip, 20000);
+	rpc2->msgin.birth = 200;
+	rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000);
+	rpc3->msgin.birth = 300;
+	homa_grant_insert_grantable(rpc3);
+	EXPECT_EQ(2, self->homa.grant->num_active_rpcs);
+
+	homa_grant_find_oldest(self->homa.grant);
+	ASSERT_NE(NULL, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(100, self->homa.grant->oldest_rpc->id);
+}
+TEST_F(homa_grant, homa_grant_find_oldest__active_rpc_has_unused_fifo_grant)
+{
+	struct homa_rpc *rpc1, *rpc2, *rpc3;
+
+	rpc1 = test_rpc_init(self, 100, self->server_ip, 400000);
+	rpc1->msgin.birth = 100;
+	self->homa.grant->fifo_grant_increment = 10000;
+	rpc1->msgin.rec_incoming = 20000 + self->homa.grant->window;
+
+	/* This RPC will be skipped because it has rank 0. */
+	rpc2 = test_rpc_init(self, 102, self->server_ip, 20000);
+	rpc2->msgin.birth = 200;
+
+	rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000);
+	rpc3->msgin.birth = 300;
+	homa_grant_insert_grantable(rpc3);
+	EXPECT_EQ(2, self->homa.grant->num_active_rpcs);
+
+	homa_grant_find_oldest(self->homa.grant);
+	ASSERT_NE(NULL, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(104, self->homa.grant->oldest_rpc->id);
+}
+TEST_F(homa_grant, homa_grant_find_oldest__no_good_candidates)
+{
+	self->homa.grant->oldest_rpc =
+			test_rpc(self, 100, self->server_ip, 40000);
+	homa_grant_find_oldest(self->homa.grant);
+	EXPECT_EQ(NULL, self->homa.grant->oldest_rpc);
+}
+TEST_F(homa_grant, homa_grant_find_oldest__take_reference)
+{
+	struct homa_rpc *rpc;
+
+	rpc = test_rpc(self, 100, self->server_ip, 40000);
+	homa_grant_insert_grantable(rpc);
+	EXPECT_EQ(1, refcount_read(&rpc->refs));
+
+	homa_grant_find_oldest(self->homa.grant);
+	EXPECT_EQ(rpc, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(2, refcount_read(&rpc->refs));
+}
+
+TEST_F(homa_grant, homa_grant_promote_rpc__rpc_is_active)
+{
+	struct homa_rpc *rpc;
+
+	test_rpc_init(self, 100, self->server_ip, 30000);
+	rpc = test_rpc_init(self, 102, self->server_ip, 40000);
+	rpc->msgin.granted += 15000;
+	EXPECT_EQ(1, rpc->msgin.rank);
+
+	homa_grant_promote_rpc(self->homa.grant, rpc);
+	EXPECT_EQ(1, rpc->msgin.rank);
+}
+TEST_F(homa_grant, homa_grant_promote_rpc__promote_into_active_space_available)
+{
+	struct homa_rpc *rpc1, *rpc2;
+
+	rpc1 = test_rpc_init(self, 100, self->server_ip, 30000);
+
+	rpc2 = test_rpc(self, 102, self->server_ip, 40000);
+	homa_grant_insert_grantable(rpc2);
+
+	homa_grant_promote_rpc(self->homa.grant, rpc2);
+	EXPECT_EQ(0, rpc1->msgin.rank);
+	EXPECT_EQ(1, rpc2->msgin.rank);
+}
+TEST_F(homa_grant, homa_grant_promote_rpc__promote_into_active_bump_existing)
+{
+	struct homa_rpc *rpc1, *rpc2;
+
+	self->homa.grant->max_overcommit = 1;
+	rpc1 = test_rpc_init(self, 100, self->server_ip, 30000);
+	rpc2 = test_rpc_init(self, 102, self->server_ip, 40000);
+	EXPECT_EQ(0, rpc1->msgin.rank);
+	EXPECT_EQ(-1, rpc2->msgin.rank);
+	rpc2->msgin.granted += 15000;
+
+	homa_grant_promote_rpc(self->homa.grant, rpc2);
+	EXPECT_EQ(-1, rpc1->msgin.rank);
+	EXPECT_EQ(0, rpc2->msgin.rank);
+}
+TEST_F(homa_grant, homa_grant_promote_rpc__promote_within_peer_list)
+{
+	struct homa_rpc *rpc;
+
+	self->homa.grant->max_overcommit = 1;
+	test_rpc_init(self, 100, self->server_ip, 30000);
+	test_rpc_init(self, 102, self->server_ip, 40000);
+	test_rpc_init(self, 104, self->server_ip, 50000);
+	test_rpc_init(self, 106, self->server_ip, 60000);
+	rpc = test_rpc_init(self, 108, self->server_ip, 70000);
+	rpc->msgin.granted += 25000;
+
+	homa_grant_promote_rpc(self->homa.grant, rpc);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 100 ungranted 30000; "
+		     "peer 1.2.3.4: id 102 ungranted 40000 "
+		     "id 108 ungranted 45000 "
+		     "id 104 ungranted 50000 "
+		     "id 106 ungranted 60000", unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_promote_rpc__promote_to_top_of_peer_list_and_adjust_peer)
+{
+	struct homa_rpc *rpc;
+
+	self->homa.grant->max_overcommit = 1;
+	test_rpc_init(self, 100, self->server_ip, 30000);
+	test_rpc_init(self, 102, self->server_ip + 1, 40000);
+	test_rpc_init(self, 104, self->server_ip + 2, 50000);
+	test_rpc_init(self, 106, self->server_ip + 2, 60000);
+	rpc = test_rpc_init(self, 108, self->server_ip + 2, 70000);
+	rpc->msgin.granted += 35000;
+
+	homa_grant_promote_rpc(self->homa.grant, rpc);
+	unit_log_clear();
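+	/* Peer 3.2.3.4's best ungranted count (35000) should now place
+	 * it ahead of peer 2.2.3.4 in the grantable peer list.
+	 */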
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 100 ungranted 30000; "
+		     "peer 3.2.3.4: id 108 ungranted 35000 "
+		     "id 104 ungranted 50000 "
+		     "id 106 ungranted 60000; "
+		     "peer 2.2.3.4: id 102 ungranted 40000", unit_log_get());
+}
+
+TEST_F(homa_grant, homa_grant_check_fifo__basics)
+{
+	struct homa_rpc *rpc;
+
+	mock_clock = 1000;
+	self->homa.num_priorities = 5;
+	self->homa.grant->max_overcommit = 1;
+	self->homa.grant->fifo_grant_time = 0;
+	self->homa.grant->fifo_grant_interval = 10000;
+	self->homa.grant->fifo_grant_increment = 20000;
+	self->homa.grant->fifo_fraction = 50;
+
+	test_rpc_init(self, 100, self->server_ip, 30000);
+	rpc = test_rpc_init(self, 102, self->server_ip, 400000);
+	EXPECT_EQ(-1, rpc->msgin.rank);
+	EXPECT_EQ(0, rpc->msgin.granted);
+
+	unit_log_clear();
+	homa_grant_check_fifo(self->homa.grant);
+	EXPECT_EQ(20000, rpc->msgin.granted);
+	EXPECT_STREQ("xmit GRANT 20000@3", unit_log_get());
+	EXPECT_EQ(rpc, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(11000, self->homa.grant->fifo_grant_time);
+	EXPECT_EQ(20000, rpc->msgin.rec_incoming);
+	EXPECT_EQ(20000, atomic_read(&self->homa.grant->total_incoming));
+	EXPECT_EQ(20000, homa_metrics_per_cpu()->fifo_grant_bytes);
+}
+TEST_F(homa_grant, homa_grant_check_fifo__not_yet_time_for_a_fifo_grant)
+{
+	struct homa_rpc *rpc;
+
+	mock_clock = 1000;
+	self->homa.grant->max_overcommit = 1;
+	self->homa.grant->fifo_grant_time = 1001;
+	self->homa.grant->fifo_grant_increment = 20000;
+
+	test_rpc_init(self, 100, self->server_ip, 30000);
+	rpc = test_rpc_init(self, 102, self->server_ip, 400000);
+	EXPECT_EQ(0, rpc->msgin.granted);
+
+	unit_log_clear();
+	homa_grant_check_fifo(self->homa.grant);
+	EXPECT_EQ(0, rpc->msgin.granted);
+	EXPECT_STREQ("", unit_log_get());
+	EXPECT_EQ(NULL, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(1001, self->homa.grant->fifo_grant_time);
+}
+TEST_F(homa_grant, homa_grant_check_fifo__fifo_grants_disabled)
+{
+	struct homa_rpc *rpc;
+
+	mock_clock = 1000;
+	self->homa.grant->max_overcommit = 1;
+	self->homa.grant->fifo_grant_time = 1000;
+	self->homa.grant->fifo_grant_increment = 0;
+	self->homa.grant->fifo_grant_interval = 2000;
+	self->homa.grant->fifo_fraction = 50;
+
+	test_rpc_init(self, 100, self->server_ip, 30000);
+	rpc = test_rpc_init(self, 102, self->server_ip, 400000);
+	EXPECT_EQ(0, rpc->msgin.granted);
+
+	unit_log_clear();
+	homa_grant_check_fifo(self->homa.grant);
+	EXPECT_EQ(0, rpc->msgin.granted);
+	EXPECT_STREQ("", unit_log_get());
+	EXPECT_EQ(NULL, self->homa.grant->oldest_rpc);
+	EXPECT_EQ(3000, self->homa.grant->fifo_grant_time);
+}
+TEST_F(homa_grant, homa_grant_check_fifo__oldest_rpc_not_responsive)
+{
+	struct homa_rpc *rpc1, *rpc2;
+
+	mock_clock = 1000;
+	self->homa.grant->max_overcommit = 1;
+	self->homa.grant->fifo_grant_time = 1000;
+	self->homa.grant->fifo_grant_increment = 20000;
+	self->homa.grant->fifo_fraction = 50;
+
+	mock_clock = 1000;
+	test_rpc_init(self, 100, self->server_ip, 30000);
+	mock_clock = 2000;
+	rpc1 = test_rpc_init(self, 102, self->server_ip, 400000);
+	mock_clock = 3000;
+	rpc2 = test_rpc_init(self, 104, self->server_ip, 300000);
+	homa_grant_find_oldest(self->homa.grant);
+	EXPECT_EQ(102, self->homa.grant->oldest_rpc->id);
+	rpc1->msgin.rec_incoming = 40000 + self->homa.grant->window;
+
+	unit_log_clear();
+	homa_grant_check_fifo(self->homa.grant);
+	EXPECT_EQ(0, rpc1->msgin.granted);
+	EXPECT_EQ(20000, rpc2->msgin.granted);
+	EXPECT_STREQ("xmit GRANT 20000@0", unit_log_get());
+	EXPECT_EQ(104, self->homa.grant->oldest_rpc->id);
+}
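+/* In the test below the only message is already active at rank 0, so
+ * there is no candidate for a FIFO grant.
+ */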
+TEST_F(homa_grant, homa_grant_check_fifo__no_suitable_rpc)
+{
+	mock_clock = 1000;
+	self->homa.grant->max_overcommit = 1;
+	self->homa.grant->fifo_grant_time = 1000;
+	self->homa.grant->fifo_grant_increment = 20000;
+	self->homa.grant->fifo_fraction = 50;
+
+	test_rpc_init(self, 100, self->server_ip, 30000);
+
+	unit_log_clear();
+	homa_grant_check_fifo(self->homa.grant);
+	EXPECT_EQ(NULL, self->homa.grant->oldest_rpc);
+	EXPECT_STREQ("", unit_log_get());
+}
+TEST_F(homa_grant, homa_grant_check_fifo__rpc_dead)
+{
+	struct homa_rpc *rpc;
+
+	mock_clock = 1000;
+	self->homa.grant->max_overcommit = 1;
+	self->homa.grant->fifo_grant_time = 0;
+	self->homa.grant->fifo_grant_increment = 20000;
+	self->homa.grant->fifo_fraction = 50;
+
+	test_rpc_init(self, 100, self->server_ip, 30000);
+	rpc = test_rpc_init(self, 102, self->server_ip, 400000);
+	EXPECT_EQ(-1, rpc->msgin.rank);
+	EXPECT_EQ(0, rpc->msgin.granted);
+	self->homa.grant->oldest_rpc = rpc;
+	homa_rpc_hold(rpc);
+	hook_end_rpc = rpc;
+	hook_end_lock_count = 2;
+	unit_hook_register(grant_spinlock_end_hook);
+
+	unit_log_clear();
+	homa_grant_check_fifo(self->homa.grant);
+	EXPECT_EQ(0, rpc->msgin.granted);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->fifo_grant_bytes);
+	EXPECT_EQ(RPC_DEAD, rpc->state);
+}
+TEST_F(homa_grant, homa_grant_check_fifo__rpc_becomes_fully_granted_so_promote_another)
+{
+	struct homa_rpc *rpc;
+
+	self->homa.grant->max_overcommit = 2;
+	self->homa.grant->fifo_grant_increment = 50000;
+	self->homa.grant->fifo_fraction = 50;
+
+	mock_clock = 1000;
+	rpc = test_rpc_init(self, 100, self->server_ip, 40000);
+	mock_clock = 2000;
+	test_rpc_init(self, 102, self->server_ip, 30000);
+	mock_clock = 3000;
+	test_rpc_init(self, 104, self->server_ip, 50000);
+	EXPECT_EQ(1, rpc->msgin.rank);
+
+	unit_log_clear();
+	homa_grant_check_fifo(self->homa.grant);
+	EXPECT_EQ(40000, rpc->msgin.granted);
+	EXPECT_EQ(-1, rpc->msgin.rank);
+	EXPECT_STREQ("xmit GRANT 40000@0; xmit GRANT 10000@0", unit_log_get());
+	EXPECT_EQ(NULL, self->homa.grant->oldest_rpc);
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 102 ungranted 30000; "
+		     "active[1]: id 104 ungranted 40000", unit_log_get());
+	EXPECT_EQ(40000, homa_metrics_per_cpu()->fifo_grant_bytes);
+}
+TEST_F(homa_grant, homa_grant_check_fifo__promote_after_fifo_grant)
+{
+	struct homa_rpc *rpc;
+
+	self->homa.grant->max_overcommit = 1;
+	self->homa.grant->fifo_grant_increment = 15000;
+	self->homa.grant->fifo_fraction = 50;
+
+	mock_clock = 1000;
+	rpc = test_rpc_init(self, 100, self->server_ip, 50000);
+	mock_clock = 2000;
+	test_rpc_init(self, 102, self->server_ip, 30000);
+	mock_clock = 3000;
+	test_rpc_init(self, 104, self->server_ip, 40000);
+
+	unit_log_clear();
+	homa_grant_check_fifo(self->homa.grant);
+	EXPECT_EQ(15000, rpc->msgin.granted);
+	EXPECT_STREQ("xmit GRANT 15000@0", unit_log_get());
+	unit_log_clear();
+	unit_log_grantables(&self->homa);
+	EXPECT_STREQ("active[0]: id 102 ungranted 30000; "
+		     "peer 1.2.3.4: id 100 ungranted 35000 "
+		     "id 104 ungranted 40000", unit_log_get());
+}
+
+TEST_F(homa_grant, homa_grant_cand_add__basics)
+{
+	struct homa_grant_candidates cand;
+	struct homa_rpc *rpc1, *rpc2;
+
+	rpc1 = test_rpc(self, 100, self->server_ip, 20000);
+	rpc2 = test_rpc(self, 102, self->server_ip, 30000);
+
+	homa_grant_cand_init(&cand);
+	homa_grant_cand_add(&cand, rpc2);
+	homa_grant_cand_add(&cand, rpc1);
+	EXPECT_EQ(2, cand.inserts);
+	EXPECT_EQ(0, cand.removes);
+	EXPECT_EQ(rpc2, cand.rpcs[0]);
+	EXPECT_EQ(rpc1, cand.rpcs[1]);
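+	/* homa_grant_cand_add holds a reference on each added RPC;
+	 * homa_grant_cand_check releases it.
+	 */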
+	EXPECT_EQ(2, refcount_read(&rpc1->refs));
+	homa_grant_cand_check(&cand, self->homa.grant);
+}
+TEST_F(homa_grant, homa_grant_cand_add__wrap_around)
+{
+	struct homa_grant_candidates cand;
+	int i;
+
+	homa_grant_cand_init(&cand);
+
+	/* Add so many RPCs that some have to be dropped. */
+	for (i = 0; i < HOMA_MAX_CAND_RPCS + 2; i++)
+		homa_grant_cand_add(&cand, test_rpc(self, 100 + 2*i,
+				    self->server_ip, 20000));
+	EXPECT_EQ(HOMA_MAX_CAND_RPCS, cand.inserts);
+	EXPECT_EQ(0, cand.removes);
+	EXPECT_EQ(100, cand.rpcs[0]->id);
+	EXPECT_EQ(114, cand.rpcs[HOMA_MAX_CAND_RPCS-1]->id);
+
+	/* Discard a couple of RPCs then add more. */
+	homa_rpc_put(cand.rpcs[0]);
+	homa_rpc_put(cand.rpcs[1]);
+	cand.removes = 2;
+	for (i = 0; i < 3; i++)
+		homa_grant_cand_add(&cand, test_rpc(self, 200 + 2*i,
+				    self->server_ip, 20000));
+	EXPECT_EQ(HOMA_MAX_CAND_RPCS + 2, cand.inserts);
+	EXPECT_EQ(2, cand.removes);
+	EXPECT_EQ(200, cand.rpcs[0]->id);
+	EXPECT_EQ(202, cand.rpcs[1]->id);
+	EXPECT_EQ(104, cand.rpcs[2]->id);
+	homa_grant_cand_check(&cand, self->homa.grant);
+}
+
+TEST_F(homa_grant, homa_grant_cand_check__basics)
+{
+	struct homa_grant_candidates cand;
+	struct homa_rpc *rpc1, *rpc2, *rpc3;
+
+	rpc1 = test_rpc_init(self, 100, self->server_ip, 20000);
+	rpc2 = test_rpc_init(self, 102, self->server_ip, 20000);
+	rpc3 = test_rpc_init(self, 104, self->server_ip, 20000);
+
+	homa_grant_cand_init(&cand);
+	homa_grant_cand_add(&cand, rpc1);
+	homa_grant_cand_add(&cand, rpc2);
+	homa_grant_cand_add(&cand, rpc3);
+	rpc2->msgin.granted = 20000;
+	unit_log_clear();
+	homa_grant_cand_check(&cand, self->homa.grant);
+	EXPECT_STREQ("xmit GRANT 10000@2; xmit GRANT 10000@0", unit_log_get());
+	EXPECT_EQ(1, refcount_read(&rpc1->refs));
+	EXPECT_EQ(1, refcount_read(&rpc2->refs));
+	EXPECT_EQ(1, refcount_read(&rpc3->refs));
+}
+TEST_F(homa_grant, homa_grant_cand_check__rpc_dead)
+{
+	struct homa_grant_candidates cand;
+	struct homa_rpc *rpc;
+	int saved_state;
+
+	rpc = test_rpc_init(self, 100, self->server_ip, 20000);
+
+	homa_grant_cand_init(&cand);
+	homa_grant_cand_add(&cand, rpc);
+	saved_state = rpc->state;
+	rpc->state = RPC_DEAD;
+
+	unit_log_clear();
+	homa_grant_cand_check(&cand, self->homa.grant);
+	EXPECT_STREQ("", unit_log_get());
+	EXPECT_EQ(1, refcount_read(&rpc->refs));
+	rpc->state = saved_state;
+}
+TEST_F(homa_grant, homa_grant_cand_check__rpc_becomes_fully_granted)
+{
+	struct homa_grant_candidates cand;
+	struct homa_rpc *rpc1, *rpc2;
+
+	self->homa.grant->max_rpcs_per_peer = 1;
+	rpc1 = test_rpc_init(self, 100, self->server_ip, 20000);
+	rpc2 = test_rpc_init(self, 102, self->server_ip, 30000);
+	EXPECT_EQ(0, rpc1->msgin.rank);
+	EXPECT_EQ(-1, rpc2->msgin.rank);
+	rpc1->msgin.bytes_remaining = 10000;
+
+	homa_grant_cand_init(&cand);
+	homa_grant_cand_add(&cand, rpc1);
+
+	unit_log_clear();
+	homa_grant_cand_check(&cand, self->homa.grant);
+	EXPECT_STREQ("xmit GRANT 20000@0; xmit GRANT 10000@0", unit_log_get());
+	EXPECT_EQ(-1, rpc1->msgin.rank);
+	EXPECT_EQ(0, rpc2->msgin.rank);
+	EXPECT_EQ(2, cand.removes);
+}
+
+TEST_F(homa_grant, homa_grant_lock_slow)
+{
+	mock_clock = 500;
+	unit_hook_register(grant_spinlock_hook);
+
+	homa_grant_lock_slow(self->homa.grant);
+	homa_grant_unlock(self->homa.grant);
+
+	EXPECT_EQ(1, homa_metrics_per_cpu()->grant_lock_misses);
+	EXPECT_EQ(500, homa_metrics_per_cpu()->grant_lock_miss_cycles);
+}
+
+TEST_F(homa_grant, homa_grant_update_sysctl_deps__max_overcommit)
+{
+	self->homa.grant->max_overcommit = 2;
+	homa_grant_update_sysctl_deps(self->homa.grant);
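+	/* Values no larger than HOMA_MAX_GRANTS must pass through
+	 * unchanged; anything larger is clamped.
+	 */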
EXPECT_EQ(2, self->homa.grant->max_overcommit); + + self->homa.grant->max_overcommit = HOMA_MAX_GRANTS; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.grant->max_overcommit); + + self->homa.grant->max_overcommit = HOMA_MAX_GRANTS+1; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.grant->max_overcommit); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__fifo_fraction) +{ + self->homa.grant->fifo_fraction = 499; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(499, self->homa.grant->fifo_fraction); + + self->homa.grant->fifo_fraction = 501; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(500, self->homa.grant->fifo_fraction); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__fifo_interval) +{ + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 500; + self->homa.link_mbps = 8000; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(40000, self->homa.grant->fifo_grant_interval); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__fifo_interval_no_fifo_grants) +{ + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 0; + self->homa.link_mbps = 8000; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(1000000000, self->homa.grant->fifo_grant_interval); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__recalc_cycles) +{ + self->homa.grant->recalc_usecs = 7; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(7000, self->homa.grant->recalc_cycles); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_window) +{ + self->homa.grant->window_param = 30000; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(30000, self->homa.grant->window); +} diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 10c303cd..4ce7e5e0 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1,127 +1,86 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" -#include "homa_lcache.h" +#include "homa_grant.h" +#include "homa_interest.h" +#include "homa_peer.h" +#include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" #include "mock.h" #include "utils.h" -/* The following variable (and hook function) are used to mark an RPC - * ready with an error (but only if thread is sleeping). 
- */ -struct homa_rpc *hook_rpc = NULL; -struct homa_sock *hook_hsk = NULL; -int delete_count = 0; -void handoff_hook(char *id) +#ifndef __STRIP__ /* See strip.py */ +#include "homa_offload.h" +#include "homa_pacer.h" +#endif /* See strip.py */ + +#ifndef __STRIP__ /* See strip.py */ +#define XMIT_DATA(rpc, force) homa_xmit_data(rpc, force) +#else /* See strip.py */ +#define XMIT_DATA(rpc, force) homa_xmit_data(rpc) +#endif /* See strip.py */ + +static struct homa_rpc *hook_rpc; +static int delete_count; +static int lock_delete_count; +static int hook_count; +static struct homa_sock *hook_shutdown_hsk; + +static void wait_hook4(char *id) { - if (strcmp(id, "schedule") != 0) - return; - if (task_is_running(current)) + if (strcmp(id, "schedule") != 0 && + strcmp(id, "do_wait_intr_irq") != 0 && + strcmp(id, "prepare_to_wait") != 0) return; - hook_rpc->error = -EFAULT; - homa_rpc_handoff(hook_rpc); - unit_log_printf("; ", - "%d in ready_requests, %d in ready_responses, " - "%d in request_interests, %d in response_interests", - unit_list_length(&hook_rpc->hsk->ready_requests), - unit_list_length(&hook_rpc->hsk->ready_responses), - unit_list_length(&hook_rpc->hsk->request_interests), - unit_list_length(&hook_rpc->hsk->response_interests)); -} - -/* The following hook function marks an RPC ready after several calls. */ -int poll_count = 0; -void poll_hook(char *id) -{ - if (strcmp(id, "schedule") != 0) + if (hook_count <= 0) return; - if (poll_count <= 0) + hook_count--; + if (hook_count != 0) return; - poll_count--; - if (poll_count == 0) { - hook_rpc->error = -EFAULT; + if (hook_shutdown_hsk) + unit_sock_destroy(hook_shutdown_hsk); + else homa_rpc_handoff(hook_rpc); - } -} - -/* The following hook function hands off an RPC (with an error). */ -void handoff_hook2(char *id) -{ - if (strcmp(id, "found_rpc") != 0) - return; - - hook_rpc->error = -ETIMEDOUT; - homa_rpc_handoff(hook_rpc); } -/* The following hook function first hands off an RPC, then deletes it. */ -int hook3_count = 0; -void handoff_hook3(char *id) +static void handoff_hook(char *id) { - if (hook3_count || (strcmp(id, "found_rpc") != 0)) + if (strcmp(id, "spin_lock") != 0) return; - hook3_count++; - - homa_rpc_handoff(hook_rpc); - homa_rpc_free(hook_rpc); -} - -/* The following hook function deletes an RPC. */ -void delete_hook(char *id) -{ - if (strcmp(id, "schedule") != 0) + if (hook_count <= 0) return; - if (delete_count == 0) { - homa_rpc_free(hook_rpc); + hook_count--; + if (hook_count == 0) { + hook_rpc->error = -ENOENT; + homa_rpc_handoff(hook_rpc); } - delete_count--; } -/* The following function is used via unit_hook to delete an RPC after it - * has been matched in homa_wait_for_message. */ -void match_delete_hook(char *id) -{ - if (strcmp(id, "found_rpc") == 0) - homa_rpc_free(hook_rpc); -} -/* The following hook function shuts down a socket. 
*/ -void shutdown_hook(char *id) +#ifdef __STRIP__ /* See strip.py */ +int mock_message_in_init(struct homa_rpc *rpc, int length, int unsched) { - if (strcmp(id, "schedule") != 0) - return; - homa_sock_shutdown(hook_hsk); + return homa_message_in_init(rpc, length); } +#define homa_message_in_init(rpc, length, unsched) \ + mock_message_in_init(rpc, length, unsched) +#endif /* See strip.py */ FIXTURE(homa_incoming) { struct in6_addr client_ip[5]; int client_port; struct in6_addr server_ip[2]; int server_port; - __u64 client_id; - __u64 server_id; - sockaddr_in_union server_addr; + u64 client_id; + u64 server_id; + union sockaddr_in_union server_addr; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; - struct data_header data; - struct homa_interest interest; - struct homa_lcache lcache; - int incoming_delta; + struct homa_sock hsk2; + struct homa_data_hdr data; }; FIXTURE_SETUP(homa_incoming) { @@ -137,2062 +96,2054 @@ FIXTURE_SETUP(homa_incoming) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); +#ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 1; self->homa.poll_cycles = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; - self->homa.pacer_fifo_fraction = 0; - self->homa.grant_fifo_fraction = 0; - self->homa.grant_threshold = self->homa.rtt_bytes; - mock_sock_init(&self->hsk, &self->homa, 0); + self->homa.qshared->fifo_fraction = 0; + self->homa.unsched_bytes = 10000; + self->homa.grant->window = 10000; +#endif /* See strip.py */ + mock_sock_init(&self->hsk, self->hnet, 0); + mock_sock_init(&self->hsk2, self->hnet, self->server_port); self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->data = (struct data_header){.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .type = DATA, - .sender_id = cpu_to_be64(self->client_id)}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .retransmit = 0, - .seg = {.offset = 0, .segment_length = htonl(1400), - .ack = {0, 0, 0}}}; + memset(&self->data, 0, sizeof(self->data)); + self->data.common = (struct homa_common_hdr){ + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = DATA, + .sender_id = cpu_to_be64(self->client_id) + }; + self->data.message_length = htonl(10000); +#ifndef __STRIP__ /* See strip.py */ + self->data.incoming = htonl(10000); +#endif /* See strip.py */ unit_log_clear(); delete_count = 0; - homa_lcache_init(&self->lcache); - self->incoming_delta = 0; + lock_delete_count = 0; + hook_shutdown_hsk = NULL; } FIXTURE_TEARDOWN(homa_incoming) { - homa_lcache_release(&self->lcache); homa_destroy(&self->homa); unit_teardown(); } -TEST_F(homa_incoming, homa_message_in_init) -{ - struct homa_message_in msgin; - homa_message_in_init(&msgin, 127, 100); - EXPECT_EQ(1, msgin.scheduled); - EXPECT_EQ(100, msgin.incoming); - homa_message_in_init(&msgin, 128, 500); - EXPECT_EQ(128, msgin.incoming); - EXPECT_EQ(0, msgin.scheduled); - homa_message_in_init(&msgin, 130, 0); - homa_message_in_init(&msgin, 0xfff, 0); - homa_message_in_init(&msgin, 0xfff0, 0); - homa_message_in_init(&msgin, 0x3000, 0); - homa_message_in_init(&msgin, 1000000, 0); - homa_message_in_init(&msgin, 2000000, 0); - EXPECT_EQ(255, homa_cores[cpu_number]->metrics.small_msg_bytes[1]); - EXPECT_EQ(130, homa_cores[cpu_number]->metrics.small_msg_bytes[2]); - 
EXPECT_EQ(0xfff, homa_cores[cpu_number]->metrics.small_msg_bytes[63]); - EXPECT_EQ(0x3000, homa_cores[cpu_number]->metrics.medium_msg_bytes[11]); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.medium_msg_bytes[15]); - EXPECT_EQ(3000000, homa_cores[cpu_number]->metrics.large_msg_bytes); -} - -TEST_F(homa_incoming, homa_add_packet__basics) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_message_in_init__basics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_message_in_init(&crpc->msgin, 10000, 0); - unit_log_clear(); - self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 1400)); - self->data.seg.offset = htonl(4200); - self->data.seg.segment_length = htonl(800); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 800, 4200)); - - self->data.seg.offset = 0; - self->data.seg.segment_length = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 0)); - unit_log_skb_list(&crpc->msgin.packets, 0); - EXPECT_STREQ("DATA 1400@0; DATA 1400@1400; DATA 800@4200", - unit_log_get()); - EXPECT_EQ(6400, crpc->msgin.bytes_remaining); + mock_clock = 200; + EXPECT_EQ(0, homa_message_in_init(crpc, 127, 100)); + EXPECT_EQ(100, crpc->msgin.granted); + EXPECT_EQ(200, crpc->msgin.birth); + EXPECT_EQ(0, homa_message_in_init(crpc, 128, 500)); + EXPECT_EQ(128, crpc->msgin.granted); + EXPECT_EQ(1, crpc->msgin.num_bpages); +} +#endif /* See strip.py */ +TEST_F(homa_incoming, homa_message_in_init__message_too_long) +{ + struct homa_rpc *srpc; + int created; - unit_log_clear(); - self->data.seg.offset = htonl(2800); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 2800)); - unit_log_skb_list(&crpc->msgin.packets, 0); - EXPECT_STREQ("DATA 1400@0; DATA 1400@1400; DATA 1400@2800; " - "DATA 800@4200", unit_log_get()); + self->data.message_length = htonl(HOMA_MAX_MESSAGE_LENGTH+1); + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(EINVAL, -PTR_ERR(srpc)); } -TEST_F(homa_incoming, homa_add_packet__ignore_resends_of_copied_out_data) +TEST_F(homa_incoming, homa_message_in_init__no_buffer_region) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_message_in_init(&crpc->msgin, 10000, 0); - unit_log_clear(); - crpc->msgin.copied_out = 1500; - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 0)); - unit_log_skb_list(&crpc->msgin.packets, 0); - EXPECT_STREQ("", - unit_log_get()); - EXPECT_EQ(10000, crpc->msgin.bytes_remaining); + + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + EXPECT_EQ(ENOMEM, -homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 0)); + EXPECT_EQ(0, crpc->msgin.num_bpages); + EXPECT_EQ(-1, crpc->msgin.length); } -TEST_F(homa_incoming, homa_add_packet__varying_sizes) +TEST_F(homa_incoming, homa_message_in_init__no_buffers_available) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_message_in_init(&crpc->msgin, 10000, 0); - unit_log_clear(); - self->data.seg.offset = 0; - self->data.seg.segment_length = htonl(4000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 4000, 0)); - 
self->data.seg.offset = htonl(4000); - self->data.seg.segment_length = htonl(6000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 6000, 4000)); - unit_log_skb_list(&crpc->msgin.packets, 0); - EXPECT_STREQ("DATA 4000@0; DATA 6000@4000", - unit_log_get()); - EXPECT_EQ(0, crpc->msgin.bytes_remaining); + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); + EXPECT_EQ(0, homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 10000)); + EXPECT_EQ(0, crpc->msgin.num_bpages); } -TEST_F(homa_incoming, homa_add_packet__redundant_packets) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_message_in_init__update_message_length_metrics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_message_in_init(&crpc->msgin, 10000, 0); - unit_log_clear(); - self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 1400)); - EXPECT_EQ(1, crpc->msgin.num_skbs); - /* Duplicate packet. */ - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 1400)); - unit_log_skb_list(&crpc->msgin.packets, 0); - EXPECT_STREQ("DATA 1400@1400", unit_log_get()); - EXPECT_EQ(1, crpc->msgin.num_skbs); + EXPECT_EQ(0, homa_message_in_init(crpc, 140, 140)); + EXPECT_EQ(0, homa_message_in_init(crpc, 130, 130)); + EXPECT_EQ(0, homa_message_in_init(crpc, 0xfff, 0xfff)); + EXPECT_EQ(0, homa_message_in_init(crpc, 0xfff0, 0xfff0)); + EXPECT_EQ(0, homa_message_in_init(crpc, 0x3000, 0x3000)); + EXPECT_EQ(0, homa_message_in_init(crpc, 1000000, 1000000)); + EXPECT_EQ(0, homa_message_in_init(crpc, 900000, 900000)); + EXPECT_EQ(270, homa_metrics_per_cpu()->small_msg_bytes[2]); + EXPECT_EQ(0xfff, homa_metrics_per_cpu()->small_msg_bytes[63]); + EXPECT_EQ(0x3000, homa_metrics_per_cpu()->medium_msg_bytes[11]); + EXPECT_EQ(0, homa_metrics_per_cpu()->medium_msg_bytes[15]); + EXPECT_EQ(1900000, homa_metrics_per_cpu()->large_msg_bytes); +} +TEST_F(homa_incoming, homa_message_in_init__update_client_rpc_metrics) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 98, 1000, 1000); - /* Top of new overlaps bottom of old. */ - unit_log_clear(); - self->data.seg.offset = htonl(1000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 1400)); - unit_log_skb_list(&crpc->msgin.packets, 0); - EXPECT_STREQ("DATA 1400@1400", unit_log_get()); - EXPECT_EQ(1, crpc->msgin.num_skbs); + EXPECT_EQ(0, homa_message_in_init(crpc, 5000, 1000)); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_started); + EXPECT_EQ(5000, homa_metrics_per_cpu()->client_response_bytes_started); +} +TEST_F(homa_incoming, homa_message_in_init__update_server_rpc_metrics) +{ + struct homa_rpc *srpc; - /* Bottom of new overlaps top of old. 
*/ - unit_log_clear(); - self->data.seg.offset = htonl(2000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 1400)); - unit_log_skb_list(&crpc->msgin.packets, 0); - EXPECT_STREQ("DATA 1400@1400", unit_log_get()); - EXPECT_EQ(1, crpc->msgin.num_skbs); + srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 4000, 10000); + EXPECT_FALSE(srpc == NULL); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_requests_started); + EXPECT_EQ(4000, homa_metrics_per_cpu()->server_request_bytes_started); } +#endif /* See strip.py */ -TEST_F(homa_incoming, homa_copy_to_user__basics) +TEST_F(homa_incoming, homa_request_retrans__request_gaps) { - struct homa_rpc *crpc; - - mock_bpage_size = 2048; - mock_bpage_shift = 11; - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (void *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 1000, 4000); - ASSERT_NE(NULL, crpc); - self->data.message_length = htonl(4000); - self->data.seg.offset = htonl(1400); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 101000), crpc, NULL, &self->incoming_delta); - self->data.seg.offset = htonl(2800); - self->data.seg.segment_length = htonl(1200); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 201800), crpc, NULL, &self->incoming_delta); + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + homa_gap_alloc(&srpc->msgin.gaps, 1000, 2000); + homa_gap_alloc(&srpc->msgin.gaps, 4000, 6000); + homa_gap_alloc(&srpc->msgin.gaps, 7000, 8000); +#ifndef __STRIP__ /* See strip.py */ + srpc->msgin.granted = srpc->msgin.recv_end; + self->homa.num_priorities = 8; +#endif /* See strip.py */ unit_log_clear(); - mock_copy_to_user_dont_copy = -1; - EXPECT_EQ(0, -homa_copy_to_user(crpc)); - EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399; " - "skb_copy_datagram_iter: 648 bytes to 0x1000578: " - "101000-101647; " - "skb_copy_datagram_iter: 752 bytes to 0x1000800: " - "101648-102399; " - "skb_copy_datagram_iter: 1200 bytes to 0x1000af0: " - "201800-202999", + + homa_request_retrans(srpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1000-1999@7; " + "xmit RESEND 4000-5999@7; " + "xmit RESEND 7000-7999@7", + unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1000-1999; " + "xmit RESEND 4000-5999; " + "xmit RESEND 7000-7999; " + "xmit RESEND 1400-9999", unit_log_get()); - EXPECT_EQ(crpc->msgin.total_length, crpc->msgin.copied_out); - EXPECT_EQ(NULL, skb_peek(&crpc->msgin.packets)); - EXPECT_EQ(0, crpc->msgin.num_skbs); +#endif /* See strip.py */ } -TEST_F(homa_incoming, homa_copy_to_user__gap_in_packets) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_request_retrans__no_granted_but_not_received_data) { - struct homa_rpc *crpc; - - mock_bpage_size = 2048; - mock_bpage_shift = 11; - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (void *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 1000, 4000); - ASSERT_NE(NULL, crpc); - self->data.message_length = htonl(4000); - self->data.seg.offset = htonl(2000); - homa_data_pkt(mock_skb_new(self->server_ip, 
&self->data.common, - 1400, 101000), crpc, NULL, &self->incoming_delta); + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + EXPECT_EQ(1400, srpc->msgin.recv_end); unit_log_clear(); - mock_copy_to_user_dont_copy = -1; - EXPECT_EQ(0, -homa_copy_to_user(crpc)); - EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399", - unit_log_get()); - EXPECT_EQ(1400, crpc->msgin.copied_out); - EXPECT_EQ(1, crpc->msgin.num_skbs); + + srpc->msgin.granted = 1400; + homa_request_retrans(srpc); + EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_incoming, homa_copy_to_user__no_buffer_pool_available) +TEST_F(homa_incoming, homa_request_retrans__granted_data_after_last_gap) { - struct homa_rpc *crpc; + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); - crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 1000, 4000); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(12, -homa_copy_to_user(crpc)); + EXPECT_EQ(1400, srpc->msgin.recv_end); + unit_log_clear(); + + srpc->msgin.granted = 3000; + homa_request_retrans(srpc); + EXPECT_STREQ("xmit RESEND 1400-2999@0", unit_log_get()); } -TEST_F(homa_incoming, homa_copy_to_user__error_in_copy_to_user) +#endif /* See strip.py */ +TEST_F(homa_incoming, homa_request_retrans__no_data_received_yet) { - struct homa_rpc *crpc; - - mock_bpage_size = 2048; - mock_bpage_shift = 11; - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (void *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 1000, 4000); - ASSERT_NE(NULL, crpc); - self->data.message_length = htonl(4000); - self->data.seg.offset = htonl(1400); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 101000), crpc, NULL, &self->incoming_delta); - self->data.seg.offset = htonl(2800); - self->data.seg.segment_length = htonl(1200); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 101000), crpc, NULL, &self->incoming_delta); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 10000); + EXPECT_EQ(-1, crpc->msgin.length); unit_log_clear(); - mock_copy_data_errors = 2; - EXPECT_EQ(14, -homa_copy_to_user(crpc)); - EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399", - unit_log_get()); - EXPECT_EQ(0, crpc->msgin.num_skbs); + + homa_request_retrans(crpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0--2@0", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0--2", unit_log_get()); +#endif /* See strip.py */ } -TEST_F(homa_incoming, homa_copy_to_user__many_chunks_for_one_skb) + +TEST_F(homa_incoming, homa_add_packet__basics) { - struct homa_rpc *crpc; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); - mock_bpage_size = 512; - mock_bpage_shift = 9; - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (void *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 1000, 4000); - ASSERT_NE(NULL, crpc); - 
self->data.message_length = htonl(4000); - self->data.seg.segment_length = htonl(3000); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 3000, 101000), crpc, NULL, &self->incoming_delta); + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + mock_clock = 5000; + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 1400)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 800, 4200)); + EXPECT_STREQ("start 0, end 1400, time 5000; start 2800, end 4200, time 5000", + unit_print_gaps(crpc)); unit_log_clear(); - mock_copy_to_user_dont_copy = -1; - EXPECT_EQ(0, -homa_copy_to_user(crpc)); - EXPECT_STREQ("skb_copy_datagram_iter: 512 bytes to 0x1000000: " - "101000-101511; " - "skb_copy_datagram_iter: 512 bytes to 0x1000200: " - "101512-102023; " - "skb_copy_datagram_iter: 512 bytes to 0x1000400: " - "102024-102535; " - "skb_copy_datagram_iter: 512 bytes to 0x1000600: " - "102536-103047; " - "skb_copy_datagram_iter: 512 bytes to 0x1000800: " - "103048-103559; " - "skb_copy_datagram_iter: 440 bytes to 0x1000a00: " - "103560-103999", + self->data.seg.offset = 0; + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + EXPECT_STREQ("start 2800, end 4200, time 5000", unit_print_gaps(crpc)); + EXPECT_EQ(6400, crpc->msgin.bytes_remaining); + + unit_log_clear(); + self->data.seg.offset = htonl(2800); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 2800)); + EXPECT_STREQ("", unit_print_gaps(crpc)); + unit_log_clear(); + unit_log_skb_list(&crpc->msgin.packets, 0); + EXPECT_STREQ("DATA 1400@1400; DATA 800@4200; DATA 1400@0; DATA 1400@2800", unit_log_get()); - EXPECT_EQ(3000, crpc->msgin.copied_out); - EXPECT_EQ(0, crpc->msgin.num_skbs); + EXPECT_EQ(4, skb_queue_len(&crpc->msgin.packets)); } - -TEST_F(homa_incoming, homa_get_resend_range__uninitialized_rpc) +TEST_F(homa_incoming, homa_add_packet__packet_overlaps_message_end) { - struct homa_message_in msgin; - struct resend_header resend; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); - msgin.total_length = -1; - homa_get_resend_range(&msgin, &resend); - EXPECT_EQ(0, resend.offset); - EXPECT_EQ(100, ntohl(resend.length)); + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + self->data.seg.offset = htonl(9000); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 1400)); + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_get_resend_range__empty_range) +TEST_F(homa_incoming, homa_add_packet__sequential_packets) { - struct resend_header resend; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 5000, 5000); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); - ASSERT_NE(NULL, srpc); - homa_get_resend_range(&srpc->msgin, &resend); - EXPECT_EQ(0, resend.offset); - EXPECT_EQ(0, ntohl(resend.length)); + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 1400)); + + 
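[Editor's note] The homa_add_packet tests in this region all exercise the same bookkeeping: recv_end tracks the byte just past the highest data received so far, a packet that skips ahead opens a gap [recv_end, offset), and a packet landing inside a gap shrinks or splits it. A toy userspace model of the interesting cases (one gap at most, hypothetical names; this is an illustration, not Homa's implementation):

#include <stdio.h>

static int recv_end;                    /* end of data received so far */
static int gap_start = -1, gap_end;     /* missing range, -1 if none */

static void add_packet(int offset, int length)
{
        if (offset == recv_end) {
                recv_end = offset + length;     /* in order: extend */
        } else if (offset > recv_end) {
                gap_start = recv_end;           /* skipped ahead: new gap */
                gap_end = offset;
                recv_end = offset + length;
        } else if (offset == gap_start) {
                gap_start += length;            /* packet_at_start_of_gap */
        } else if (offset > gap_start && offset + length < gap_end) {
                /* packet_in_middle_of_gap: one gap becomes two. */
                printf("split: [%d,%d) and [%d,%d)\n", gap_start, offset,
                       offset + length, gap_end);
        }
}

int main(void)
{
        add_packet(0, 1400);    /* recv_end = 1400, no gap */
        add_packet(4200, 1400); /* gap [1400,4200), recv_end = 5600 */
        add_packet(2000, 1400); /* prints: split: [1400,2000) and [3400,4200) */
        return 0;
}

The split case reproduces the "start 1400, end 2000; start 3400, end 4200" expectation in packet_in_middle_of_gap below.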
self->data.seg.offset = htonl(2800); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 2800)); + EXPECT_STREQ("", unit_print_gaps(crpc)); + EXPECT_EQ(4200, crpc->msgin.recv_end); + EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_get_resend_range__various_gaps) +TEST_F(homa_incoming, homa_add_packet__new_gap) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_message_in_init(&crpc->msgin, 10000, 0); - crpc->msgin.incoming = 10000; - struct resend_header resend; - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 1400)); - homa_get_resend_range(&crpc->msgin, &resend); - EXPECT_EQ(1400, ntohl(resend.offset)); - EXPECT_EQ(8600, ntohl(resend.length)); + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); - self->data.seg.offset = htonl(8600); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 8600)); - homa_get_resend_range(&crpc->msgin, &resend); - EXPECT_EQ(1400, ntohl(resend.offset)); - EXPECT_EQ(7200, ntohl(resend.length)); + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); + EXPECT_EQ(5600, crpc->msgin.recv_end); + EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); +} +TEST_F(homa_incoming, homa_add_packet__no_memory_for_new_gap) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); - self->data.seg.offset = htonl(6000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 6000)); - homa_get_resend_range(&crpc->msgin, &resend); - EXPECT_EQ(1400, ntohl(resend.offset)); - EXPECT_EQ(4600, ntohl(resend.length)); + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); - self->data.seg.offset = htonl(4600); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 4600)); - homa_get_resend_range(&crpc->msgin, &resend); - EXPECT_EQ(1400, ntohl(resend.offset)); - EXPECT_EQ(3200, ntohl(resend.length)); + self->data.seg.offset = htonl(4200); + mock_kmalloc_errors = 1; + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("", unit_print_gaps(crpc)); + EXPECT_EQ(1400, crpc->msgin.recv_end); + EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_get_resend_range__received_past_granted) +TEST_F(homa_incoming, homa_add_packet__packet_before_gap) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_message_in_init(&crpc->msgin, 10000, 0); - struct resend_header resend; - self->data.message_length = htonl(2500); + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); - self->data.seg.offset = htonl(1500); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 
1400, 4200)); + EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); - self->data.seg.offset = htonl(2900); - self->data.seg.segment_length = htonl(1100); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1100, 0)); - crpc->msgin.incoming = 2000; - homa_get_resend_range(&crpc->msgin, &resend); - EXPECT_EQ(1400, ntohl(resend.offset)); - EXPECT_EQ(100, ntohl(resend.length)); + EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_get_resend_range__gap_at_beginning) +TEST_F(homa_incoming, homa_add_packet__packet_straddles_start_of_gap) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_message_in_init(&crpc->msgin, 10000, 0); - struct resend_header resend; - self->data.seg.offset = htonl(6200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 6200)); - homa_get_resend_range(&crpc->msgin, &resend); - EXPECT_EQ(0, ntohl(resend.offset)); - EXPECT_EQ(6200, ntohl(resend.length)); + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(1000); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 1000)); + EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_get_resend_range__gap_starts_just_after_copied_out) +TEST_F(homa_incoming, homa_add_packet__packet_extends_past_gap) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_message_in_init(&crpc->msgin, 10000, 0); - struct resend_header resend; - self->data.seg.offset = htonl(5000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 6200)); - crpc->msgin.bytes_remaining = 6600; - crpc->msgin.incoming = 7000; - crpc->msgin.copied_out = 2000; - homa_get_resend_range(&crpc->msgin, &resend); - EXPECT_EQ(2000, ntohl(resend.offset)); - EXPECT_EQ(3000, ntohl(resend.length)); -} + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); -TEST_F(homa_incoming, homa_pkt_dispatch__handle_ack) -{ - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); - //mock_sock_init(&self->hsk, &self->homa, 0); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - self->data.seg.ack = (struct homa_ack) { - .client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id)}; - self->data.common.sender_id = cpu_to_be64(self->client_id+10); - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); - homa_sock_shutdown(&hsk); -} -TEST_F(homa_incoming, 
homa_pkt_dispatch__new_server_rpc) -{ - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, mock_skb_count()); + self->data.seg.offset = htonl(2000); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 2000)); + EXPECT_STREQ("start 1400, end 2000", unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 1400)); + EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_pkt_dispatch__existing_server_rpc) +TEST_F(homa_incoming, homa_add_packet__packet_at_start_of_gap) { - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 10000, 100); - ASSERT_NE(NULL, srpc); - EXPECT_EQ(8600, srpc->msgin.bytes_remaining); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); + self->data.seg.offset = htonl(1400); - self->data.common.sender_id = cpu_to_be64(self->client_id); - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_EQ(7200, srpc->msgin.bytes_remaining); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 1400)); + EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); + unit_log_clear(); + EXPECT_STREQ("start 2800, end 4200", unit_print_gaps(crpc)); } -TEST_F(homa_incoming, homa_pkt_dispatch__cant_create_rpc) +TEST_F(homa_incoming, homa_add_packet__packet_covers_entire_gap) { - mock_kmalloc_errors = 1; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(0, mock_skb_count()); -} -TEST_F(homa_incoming, homa_pkt_dispatch__non_data_packet_for_existing_server_rpc) -{ - struct resend_header resend = {.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .type = RESEND, - .sender_id = cpu_to_be64(self->client_id)}, - .offset = 0, - .length = 1000, - .priority = 3}; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 10000, 100); - ASSERT_NE(NULL, srpc); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - homa_pkt_dispatch(mock_skb_new(self->client_ip, &resend.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit BUSY", unit_log_get()); + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(2800); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + 
&self->data.common, 1400, 2800)); + EXPECT_STREQ("start 1400, end 2800", unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 1400)); + EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); + EXPECT_STREQ("", unit_print_gaps(crpc)); } -TEST_F(homa_incoming, homa_pkt_dispatch__unknown_client_rpc) +TEST_F(homa_incoming, homa_add_packet__packet_beyond_end_of_gap) { - struct resend_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(99991), - .type = UNKNOWN}}; - mock_xmit_log_verbose = 1; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.unknown_rpcs); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(5000); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 5000)); + EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_pkt_dispatch__unknown_server_rpc) +TEST_F(homa_incoming, homa_add_packet__packet_straddles_end_of_gap) { - struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), - .sender_id = cpu_to_be64(99990), - .type = UNKNOWN}}; - mock_xmit_log_verbose = 1; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.unknown_rpcs); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(4000); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4000)); + EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_pkt_dispatch__existing_client_rpc) +TEST_F(homa_incoming, homa_add_packet__packet_at_end_of_gap) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(11200, crpc->msgout.granted); + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); - struct grant_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = GRANT}, - .offset = htonl(12600), - .priority = 3}; - 
homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(12600, crpc->msgout.granted); + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(2800); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 2800)); + EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); + EXPECT_STREQ("start 1400, end 2800", unit_print_gaps(crpc)); } -TEST_F(homa_incoming, homa_pkt_dispatch__lcached_client_rpc) +TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(11200, crpc->msgout.granted); + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); + mock_clock = 1000; + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); - struct grant_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = GRANT}, - .offset = htonl(12600), - .priority = 3}; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(12600, crpc->msgout.granted); - h.offset = htonl(14000); - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(14000, crpc->msgout.granted); + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200, time 1000", + unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(2000); + mock_clock = 2000; + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 2000)); + EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); + EXPECT_STREQ("start 1400, end 2000, time 1000; start 3400, end 4200, time 1000", + unit_print_gaps(crpc)); } -TEST_F(homa_incoming, homa_pkt_dispatch__cutoffs_for_unknown_client_rpc) +TEST_F(homa_incoming, homa_add_packet__kmalloc_failure_while_splitting_gap) { - struct homa_peer *peer; - struct cutoffs_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(99991), - .type = CUTOFFS}, - .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), - htonl(7), htonl(6), htonl(5), htonl(4), - htonl(3)}, - .cutoff_version = 400}; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - peer = homa_peer_find(&self->homa.peers, self->server_ip, - &self->hsk.inet); - ASSERT_FALSE(IS_ERR(peer)); - EXPECT_EQ(400, peer->cutoff_version); - EXPECT_EQ(9, peer->unsched_cutoffs[1]); - EXPECT_EQ(3, peer->unsched_cutoffs[7]); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + mock_clock = 1000; + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, 
mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200, time 1000", + unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(2000); + mock_clock = 2000; + mock_kmalloc_errors = 1; + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 2000)); + EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); + EXPECT_STREQ("start 1400, end 4200, time 1000", unit_print_gaps(crpc)); } -TEST_F(homa_incoming, homa_pkt_dispatch__resend_for_unknown_server_rpc) +TEST_F(homa_incoming, homa_add_packet__scan_multiple_gaps) { - struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), - .sender_id = cpu_to_be64(99990), - .type = RESEND}, - .offset = 0, .length = 2000, .priority = 5}; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit UNKNOWN", unit_log_get()); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 0, end 1400; start 2800, end 4200", + unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(2800); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 2800)); + EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); + EXPECT_STREQ("start 0, end 1400", unit_print_gaps(crpc)); } -TEST_F(homa_incoming, homa_pkt_dispatch__reset_counters) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_add_packet__discard_metrics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); + crpc->msgin.recv_end = 4200; + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); + EXPECT_EQ(0, homa_metrics_per_cpu()->resent_discards); + EXPECT_EQ(1, homa_metrics_per_cpu()->packet_discards); + + self->data.retransmit = 1; + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); + EXPECT_EQ(1, homa_metrics_per_cpu()->resent_discards); + EXPECT_EQ(1, homa_metrics_per_cpu()->packet_discards); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); + EXPECT_EQ(1, homa_metrics_per_cpu()->resent_packets_used); +} +TEST_F(homa_incoming, homa_add_packet__client_rpc_metrics) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 1000, 2000); + + homa_message_in_init(crpc, 2000, 0); + + /* First packet doesn't complete message. 
*/ + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_responses_done); + + /* Second packet completes message. */ + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 600, 0)); + EXPECT_EQ(2000, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_done); +} +TEST_F(homa_incoming, homa_add_packet__server_rpc_metrics) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 2000, 10000); + EXPECT_EQ(1400, homa_metrics_per_cpu()->server_request_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_requests_done); + + /* Second packet completes message. */ + self->data.seg.offset = htonl(1400); + homa_add_packet(srpc, mock_skb_alloc(self->server_ip, + &self->data.common, 600, 0)); + EXPECT_EQ(2000, homa_metrics_per_cpu()->server_request_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_requests_done); +} +#endif /* See strip.py */ + +TEST_F(homa_incoming, homa_copy_to_user__basics) +{ + struct homa_rpc *crpc; + + mock_bpage_size = 2048; + mock_bpage_shift = 11; + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 4000); ASSERT_NE(NULL, crpc); - EXPECT_EQ(11200, crpc->msgout.granted); + self->data.message_length = htonl(4000); + self->data.seg.offset = htonl(1400); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, 101000), crpc); + self->data.seg.offset = htonl(2800); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1200, 201800), crpc); + EXPECT_NE(0, test_bit(RPC_PKTS_READY, &crpc->flags)); + unit_log_clear(); - crpc->silent_ticks = 5; - crpc->peer->outstanding_resends = 2; + mock_copy_to_user_dont_copy = -1; + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); + EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399; " + "skb_copy_datagram_iter: 648 bytes to 0x1000578: 101000-101647; " + "skb_copy_datagram_iter: 752 bytes to 0x1000800: 101648-102399; " + "skb_copy_datagram_iter: 1200 bytes to 0x1000af0: 201800-202999", + unit_log_get()); + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); + EXPECT_EQ(0, test_bit(RPC_PKTS_READY, &crpc->flags)); +} +TEST_F(homa_incoming, homa_copy_to_user__rpc_freed) +{ + struct homa_rpc *crpc; - struct grant_header h = {.common = {.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = GRANT}, - .offset = htonl(12600), .priority = 3}; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(0, crpc->silent_ticks); - EXPECT_EQ(0, crpc->peer->outstanding_resends); + mock_bpage_size = 2048; + mock_bpage_shift = 11; + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 4000); + ASSERT_NE(NULL, crpc); + homa_rpc_end(crpc); - /* Don't reset silent_ticks for some packet types. 
*/ - h.common.type = NEED_ACK; - crpc->silent_ticks = 5; - crpc->peer->outstanding_resends = 2; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(5, crpc->silent_ticks); - EXPECT_EQ(0, crpc->peer->outstanding_resends); + unit_log_clear(); + mock_copy_to_user_dont_copy = -1; + EXPECT_EQ(EINVAL, -homa_copy_to_user(crpc)); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_pkt_dispatch__forced_reap) +TEST_F(homa_incoming, homa_copy_to_user__multiple_batches) { - struct homa_rpc *dead = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 20000); - homa_rpc_free(dead); - EXPECT_EQ(30, self->hsk.dead_skbs); - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 10000, 5000); - ASSERT_NE(NULL, srpc); - self->homa.dead_buffs_limit = 16; - mock_cycles = ~0; + struct homa_rpc *crpc; + int offset; - /* First packet: below the threshold for reaps. */ - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_EQ(30, self->hsk.dead_skbs); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.data_pkt_reap_cycles); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 20000); + ASSERT_NE(NULL, crpc); + self->data.message_length = htonl(20000); + for (offset = 1400; offset < 1400*8; offset += 1400) { + self->data.seg.offset = htonl(offset); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, offset), crpc); + } + EXPECT_EQ(8, skb_queue_len(&crpc->msgin.packets)); - /* Second packet: must reap. 
*/ - self->homa.dead_buffs_limit = 15; - self->homa.reap_limit = 10; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_EQ(20, self->hsk.dead_skbs); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.data_pkt_reap_cycles); + unit_log_clear(); + mock_copy_to_user_dont_copy = -1; + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); + EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399; " + "skb_copy_datagram_iter: 1400 bytes to 0x1000578: 1400-2799; " + "skb_copy_datagram_iter: 1400 bytes to 0x1000af0: 2800-4199; " + "skb_copy_datagram_iter: 1400 bytes to 0x1001068: 4200-5599; " + "skb_copy_datagram_iter: 1400 bytes to 0x10015e0: 5600-6999; " + "skb_copy_datagram_iter: 1400 bytes to 0x1001b58: 7000-8399; " + "skb_copy_datagram_iter: 1400 bytes to 0x10020d0: 8400-9799; " + "skb_copy_datagram_iter: 1400 bytes to 0x1002648: 9800-11199", + unit_log_get()); + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_pkt_dispatch__unknown_type) +TEST_F(homa_incoming, homa_copy_to_user__nothing_to_copy) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 20000); ASSERT_NE(NULL, crpc); - EXPECT_EQ(11200, crpc->msgout.granted); + EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); + + /* First call finds packets to copy. */ unit_log_clear(); + mock_copy_to_user_dont_copy = -1; + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); + EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399", + unit_log_get()); + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); - struct common_header h = {.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), .type = 99}; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h, 0, 0), &self->hsk, - &self->lcache, &self->incoming_delta); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.unknown_packet_types); + /* Second call finds no packets. 
*/ + unit_log_clear(); + EXPECT_EQ(0, -homa_copy_to_user(crpc)); + EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_incoming, homa_pkt_dispatch__new_server_rpc_but_socket_shutdown) +TEST_F(homa_incoming, homa_copy_to_user__many_chunks_for_one_skb) { - self->hsk.shutdown = 1; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - self->hsk.shutdown = 0; -} + struct homa_rpc *crpc; -TEST_F(homa_incoming, homa_data_pkt__basics) + mock_bpage_size = 512; + mock_bpage_shift = 9; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 4000); + ASSERT_NE(NULL, crpc); + self->data.message_length = htonl(4000); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 3000, 101000), crpc); + + unit_log_clear(); + mock_copy_to_user_dont_copy = -1; + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); + EXPECT_STREQ("skb_copy_datagram_iter: 512 bytes to 0x1000000: 101000-101511; " + "skb_copy_datagram_iter: 512 bytes to 0x1000200: 101512-102023; " + "skb_copy_datagram_iter: 512 bytes to 0x1000400: 102024-102535; " + "skb_copy_datagram_iter: 512 bytes to 0x1000600: 102536-103047; " + "skb_copy_datagram_iter: 512 bytes to 0x1000800: 103048-103559; " + "skb_copy_datagram_iter: 440 bytes to 0x1000a00: 103560-103999", + unit_log_get()); +} +TEST_F(homa_incoming, homa_copy_to_user__skb_data_extends_past_message_end) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 1600); + struct homa_data_hdr *h; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 4000); ASSERT_NE(NULL, crpc); + self->data.message_length = htonl(4000); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 3000, 101000), crpc); + unit_log_clear(); - crpc->msgout.next_xmit_offset = crpc->msgout.length; - self->data.message_length = htonl(1600); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 0), crpc, NULL, &self->incoming_delta); - EXPECT_EQ(RPC_INCOMING, crpc->state); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); - EXPECT_EQ(200, crpc->msgin.bytes_remaining); - EXPECT_EQ(1, crpc->msgin.num_skbs); - EXPECT_EQ(1600, crpc->msgin.incoming); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.responses_received); + mock_copy_to_user_dont_copy = -1; + h = (struct homa_data_hdr *)skb_peek(&crpc->msgin.packets)->data; + h->seg.offset = htonl(4000); + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); + EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state) +TEST_F(homa_incoming, homa_copy_to_user__error_in_import_ubuf) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 4000); ASSERT_NE(NULL, crpc); - crpc->state = RPC_DEAD; - self->data.message_length = htonl(2000); - self->data.seg.offset = htonl(1400); - self->data.seg.segment_length = htonl(600); - 
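[Editor's note] The skb_copy_datagram_iter expectations in many_chunks_for_one_skb above are just bpage arithmetic: destination buffers are carved into bpage-sized chunks, so with 512-byte bpages a single 3000-byte skb is copied as five full chunks plus a 440-byte tail. A toy model of that arithmetic (not module code):

#include <stdio.h>

int main(void)
{
        int bpage = 512, len = 3000, off = 0;

        while (off < len) {
                int chunk = (len - off < bpage) ? len - off : bpage;

                printf("copy %d bytes at offset %d\n", chunk, off);
                off += chunk;
        }
        return 0;
}

This prints five 512-byte copies followed by one 440-byte copy, matching the six chunks the test logs.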
homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 600, 1400), crpc, NULL, &self->incoming_delta); - EXPECT_EQ(600, crpc->msgin.bytes_remaining); - EXPECT_EQ(1, crpc->msgin.num_skbs); - crpc->state = RPC_INCOMING; + unit_log_clear(); + mock_import_ubuf_errors = 1; + homa_rpc_lock(crpc); + EXPECT_EQ(13, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_data_pkt__wrong_server_rpc_state) +TEST_F(homa_incoming, homa_copy_to_user__error_in_skb_copy_datagram_iter) { - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 1400, 5000); - ASSERT_NE(NULL, srpc); - int skbs = mock_skb_count(); - homa_data_pkt(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), srpc, NULL, &self->incoming_delta); - EXPECT_EQ(RPC_OUTGOING, srpc->state); - EXPECT_EQ(skbs, mock_skb_count()); + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 4000); + ASSERT_NE(NULL, crpc); + + unit_log_clear(); + mock_copy_data_errors = 1; + homa_rpc_lock(crpc); + EXPECT_EQ(14, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_data_pkt__initialize_msgin) +#ifdef HOMA_TIMETRACE_H +TEST_F(homa_incoming, homa_copy_to_user__timetrace_info) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 1600); + struct homa_rpc *crpc; + char traces[1000]; + int offset; + + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 20000); ASSERT_NE(NULL, crpc); - self->data.message_length = htonl(1600); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 0), crpc, NULL, &self->incoming_delta); - EXPECT_EQ(200, crpc->msgin.bytes_remaining); - EXPECT_EQ(1600, crpc->msgin.incoming); - EXPECT_EQ(200, self->incoming_delta); + self->data.message_length = htonl(20000); + for (offset = 4200; offset < 1400*10; offset += 1400) { + self->data.seg.offset = htonl(offset); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, offset), crpc); + } + EXPECT_EQ(8, skb_queue_len(&crpc->msgin.packets)); + + unit_log_clear(); + mock_copy_to_user_dont_copy = -1; + tt_init(NULL); + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); + tt_get_messages(traces, sizeof(traces)); + EXPECT_STREQ("starting copy to user space for id 1234; " + "copied out bytes 0-1400 for id 1234; " + "copied out bytes 4200-7000 for id 1234; " + "finished freeing 3 skbs for id 1234; " + "starting copy to user space for id 1234; " + "copied out bytes 7000-11200 for id 1234; " + "finished freeing 3 skbs for id 1234; " + "starting copy to user space for id 1234; " + "copied out bytes 11200-14000 for id 1234; " + "finished freeing 2 skbs for id 1234", + traces); + tt_destroy(); } -TEST_F(homa_incoming, homa_data_pkt__update_delta) +#endif + +TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 5000); - EXPECT_NE(NULL, crpc); + struct sk_buff 
*skb; + + self->data.common.dport = htons(100); + + // Make sure the test uses IPv4. + mock_ipv6 = false; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); + homa_dispatch_pkts(skb); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_STREQ("icmp_send type 3, code 3", unit_log_get()); +} +TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) +{ + struct sk_buff *skb; - /* Total incoming goes up on first packet (count unscheduled bytes). */ - self->data.message_length = htonl(5000); - self->data.incoming = htonl(4000); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 0), crpc, NULL, &self->incoming_delta); - EXPECT_EQ(2600, self->incoming_delta); + self->data.common.dport = htons(100); - /* Total incoming drops on subsequent packet. */ - self->data.seg.offset = htonl(2800); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 2800), crpc, NULL, &self->incoming_delta); - EXPECT_EQ(1200, self->incoming_delta); + // Make sure the test uses IPv6. + mock_ipv6 = true; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + unit_log_clear(); + homa_dispatch_pkts(skb); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_STREQ("icmp6_send type 1, code 4", unit_log_get()); +} +TEST_F(homa_incoming, homa_dispatch_pkts__server_not_enabled) +{ + struct sk_buff *skb; + + self->data.common.dport = htons(100); + + // Make sure the test uses IPv4. + mock_ipv6 = false; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + self->hsk.is_server = false; + + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + unit_log_clear(); + homa_dispatch_pkts(skb); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_STREQ("icmp_send type 3, code 3", unit_log_get()); +} +TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) +{ + struct sk_buff *skb, *skb2, *skb3; + + self->data.common.dport = htons(100); + + // Make sure the test uses IPv6. 
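[Editor's note] The strings expected by the two unknown-socket tests above, "icmp_send type 3, code 3" and "icmp6_send type 1, code 4", are the standard "destination unreachable / port unreachable" numbers for ICMP and ICMPv6. A quick userspace check using the standard headers (background reference only, not patch code):

#include <stdio.h>
#include <netinet/ip_icmp.h>    /* ICMP_DEST_UNREACH, ICMP_PORT_UNREACH */
#include <netinet/icmp6.h>      /* ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT */

int main(void)
{
        printf("icmp_send type %d, code %d\n",
               ICMP_DEST_UNREACH, ICMP_PORT_UNREACH);           /* 3, 3 */
        printf("icmp6_send type %d, code %d\n",
               ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT);    /* 1, 4 */
        return 0;
}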
+ mock_ipv6 = true; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + skb->next = skb2; + skb2->next = skb3; + unit_log_clear(); + homa_dispatch_pkts(skb); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_STREQ("icmp6_send type 1, code 4", unit_log_get()); +} +TEST_F(homa_incoming, homa_dispatch_pkts__new_server_rpc) +{ + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0)); + EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); + EXPECT_EQ(1, mock_skb_count()); +} +TEST_F(homa_incoming, homa_dispatch_pkts__cant_create_server_rpc) +{ + mock_kmalloc_errors = 1; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(0, mock_skb_count()); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->server_cant_create_rpcs); +#endif /* See strip.py */ +} +TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + + ASSERT_NE(NULL, srpc); + EXPECT_EQ(8600, srpc->msgin.bytes_remaining); + self->data.seg.offset = htonl(1400); + self->data.common.sender_id = cpu_to_be64(self->client_id); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0)); + EXPECT_EQ(7200, srpc->msgin.bytes_remaining); +} +TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rpc) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_IN_SERVICE, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + struct homa_resend_hdr resend = {.common = { + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = RESEND, + .sender_id = cpu_to_be64(self->client_id)}, + .offset = 0, +#ifndef __STRIP__ /* See strip.py */ + .length = 1000, + .priority = 3}; +#else /* See strip.py */ + .length = 1000}; +#endif /* See strip.py */ - /* Duplicate packet should have no effect. */ - self->data.seg.offset = htonl(2800); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 2800), crpc, NULL, &self->incoming_delta); - EXPECT_EQ(1200, self->incoming_delta); + ASSERT_NE(NULL, srpc); + unit_log_clear(); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &resend.common, + 0, 0)); + EXPECT_STREQ("xmit BUSY", unit_log_get()); } -TEST_F(homa_incoming, homa_data_pkt__handoff) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 3000); + self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(RPC_OUTGOING, crpc->state); unit_log_clear(); - crpc->msgout.next_xmit_offset = crpc->msgout.length; - - /* First packet is not first in sequence, so can't hand off. 
*/ - self->data.message_length = htonl(3000); - self->data.seg.offset = htonl(1400); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 0), crpc, NULL, &self->incoming_delta); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_responses)); - EXPECT_FALSE(atomic_read(&crpc->flags) & RPC_PKTS_READY); - EXPECT_EQ(1600, crpc->msgin.bytes_remaining); - EXPECT_EQ(1, crpc->msgin.num_skbs); - /* Second packet fills the gap. */ - self->data.message_length = htonl(3000); - self->data.seg.offset = htonl(0); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 0), crpc, NULL, &self->incoming_delta); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); - EXPECT_TRUE(atomic_read(&crpc->flags) & RPC_PKTS_READY); + crpc->msgout.next_xmit_offset = crpc->msgout.length; + self->data.message_length = htonl(1600); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, 0), crpc); + EXPECT_EQ(RPC_INCOMING, crpc->state); EXPECT_EQ(200, crpc->msgin.bytes_remaining); - EXPECT_EQ(2, crpc->msgin.num_skbs); } -TEST_F(homa_incoming, homa_data_pkt__add_to_grantables) +TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) { - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100000, 1000); - ASSERT_NE(NULL, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_SUBSTR("id 1235", unit_log_get()); + struct homa_grant_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(99991), + .type = RPC_UNKNOWN}}; + + mock_xmit_log_verbose = 1; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_rpcs); } -TEST_F(homa_incoming, homa_data_pkt__send_cutoffs) +TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) { - self->homa.cutoff_version = 2; - self->homa.unsched_cutoffs[0] = 19; - self->homa.unsched_cutoffs[1] = 18; - self->homa.unsched_cutoffs[2] = 17; - self->homa.unsched_cutoffs[3] = 16; - self->homa.unsched_cutoffs[4] = 15; - self->homa.unsched_cutoffs[5] = 14; - self->homa.unsched_cutoffs[6] = 13; - self->homa.unsched_cutoffs[7] = 12; - self->data.message_length = htonl(5000); - mock_xmit_log_verbose = 1; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_SUBSTR("cutoffs 19 18 17 16 15 14 13 12, version 2", - unit_log_get()); + struct homa_resend_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->server_port), + .sender_id = cpu_to_be64(99990), + .type = GRANT}}; - /* Try again, but this time no comments should be sent because - * no time has elapsed since the last cutoffs were sent. 
- */ - unit_log_clear(); - self->homa.cutoff_version = 3; - self->data.seg.offset = 1400; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_STREQ("", unit_log_get()); + mock_xmit_log_verbose = 1; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(0, homa_metrics_per_cpu()->unknown_rpcs); } -TEST_F(homa_incoming, homa_data_pkt__cutoffs_up_to_date) +TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) { - self->homa.cutoff_version = 123; - self->data.cutoff_version = htons(123); - homa_pkt_dispatch(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), &self->hsk, &self->lcache, - &self->incoming_delta); - EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); -} + struct homa_cutoffs_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(99991), + .type = CUTOFFS}, + .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), + htonl(7), htonl(6), htonl(5), htonl(4), + htonl(3)}, + .cutoff_version = 400}; + struct homa_peer *peer; -TEST_F(homa_incoming, homa_grant_pkt__basics) + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + peer = homa_peer_get(&self->hsk, self->server_ip); + ASSERT_FALSE(IS_ERR(peer)); + EXPECT_EQ(400, peer->cutoff_version); + EXPECT_EQ(9, peer->unsched_cutoffs[1]); + EXPECT_EQ(3, peer->unsched_cutoffs[7]); + homa_peer_release(peer); +} +#endif /* See strip.py */ +TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) { - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); - ASSERT_NE(NULL, srpc); - homa_xmit_data(srpc, false); - unit_log_clear(); - - struct grant_header h = {{.sport = htons(srpc->dport), - .dport = htons(self->hsk.port), - .sender_id = cpu_to_be64(self->client_id), - .type = GRANT}, - .offset = htonl(12600), - .priority = 3}; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(12600, srpc->msgout.granted); - EXPECT_STREQ("xmit DATA 1400@11200", unit_log_get()); - - /* Don't let grant offset go backwards. */ - h.offset = htonl(10000); - unit_log_clear(); - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(12600, srpc->msgout.granted); - EXPECT_STREQ("", unit_log_get()); - - /* Wrong state. */ - h.offset = htonl(20000); - srpc->state = RPC_INCOMING; - unit_log_clear(); - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(12600, srpc->msgout.granted); - EXPECT_STREQ("", unit_log_get()); + struct homa_resend_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(99990), + .type = RESEND}, +#ifndef __STRIP__ /* See strip.py */ + .offset = 0, .length = 2000, .priority = 5}; +#else /* See strip.py */ + .offset = 0, .length = 2000}; +#endif /* See strip.py */ - /* Must restore old state to avoid potential crashes. 
*/ - srpc->state = RPC_OUTGOING; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit RPC_UNKNOWN", unit_log_get()); } -TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_grant_hdr h = {.common = {.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = GRANT}, + .offset = htonl(12600), .priority = 3}; + ASSERT_NE(NULL, crpc); + EXPECT_EQ(10000, crpc->msgout.granted); unit_log_clear(); + crpc->silent_ticks = 5; + crpc->peer->outstanding_resends = 2; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_EQ(0, crpc->silent_ticks); + EXPECT_EQ(0, crpc->peer->outstanding_resends); - struct grant_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = GRANT}, - .offset = htonl(25000), - .priority = 3}; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(20000, crpc->msgout.granted); + /* Don't reset silent_ticks for some packet types. */ + h.common.type = CUTOFFS; + crpc->silent_ticks = 5; + crpc->peer->outstanding_resends = 2; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_EQ(5, crpc->silent_ticks); + EXPECT_EQ(0, crpc->peer->outstanding_resends); } - -TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) +#endif /* See strip.py */ +TEST_F(homa_incoming, homa_dispatch_pkts__dont_reset_silent_ticks_on_NEED_ACK) { - struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), - .sender_id = cpu_to_be64(self->client_id), - .type = RESEND}, - .offset = htonl(100), - .length = htonl(200), - .priority = 3}; + /* Note: if NEED_ACKs cause silent_ticks to get reset, can get in + * a strange state where the server sent a response and is waiting + * for an ACK, so it sends NEED_ACKs. But all the response packets + * got lost, so the client can't ack, and the NEED_ACKs reset + * silent ticks so the client doesn't issue RESENDs. 
+ */ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + struct homa_need_ack_hdr h = {.common = { + .sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = NEED_ACK}}; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit UNKNOWN", unit_log_get()); + ASSERT_NE(NULL, crpc); + unit_log_clear(); + crpc->silent_ticks = 2; + crpc->peer->outstanding_resends = 3; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_EQ(2, crpc->silent_ticks); + EXPECT_EQ(0, crpc->peer->outstanding_resends); } -TEST_F(homa_incoming, homa_resend_pkt__server_sends_busy) +TEST_F(homa_incoming, homa_dispatch_pkts__multiple_ack_packets) { - struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), - .sender_id = cpu_to_be64(self->client_id), - .type = RESEND}, - .offset = htonl(100), - .length = htonl(200), - .priority = 3}; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); + self->server_id, 100, 3000); + struct sk_buff *skb, *skb2, *skb3; + struct homa_ack_hdr ack; + ASSERT_NE(NULL, srpc); - unit_log_clear(); + ack.common = self->data.common; + ack.common.type = ACK; + ack.common.sender_id += 100; + ack.num_acks = htons(1); + ack.acks[0].server_port = htons(self->server_port); + ack.acks[0].client_id = cpu_to_be64(self->client_id + 4); + skb = mock_skb_alloc(self->client_ip, &ack.common, 0, 0); + skb2 = mock_skb_alloc(self->client_ip, &ack.common, 0, 0); + skb3 = mock_skb_alloc(self->client_ip, &ack.common, 0, 0); + skb->next = skb2; + skb2->next = skb3; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit BUSY", unit_log_get()); + unit_log_clear(); + homa_dispatch_pkts(skb); + EXPECT_SUBSTR("ack 1239", unit_log_get()); } -TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) +TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) { - /* Important to respond to resends even if client thinks the - * server must already have received everything. 
- */ - struct resend_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = RESEND}, - .offset = htonl(100), - .length = htonl(200), - .priority = 3}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 2000, 3000); + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(10000, crpc->msgout.granted); +#endif /* See strip.py */ unit_log_clear(); - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); + struct homa_common_hdr h = {.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), .type = 99}; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h, 0, 0)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_packet_types); +#endif /* See strip.py */ } -TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) +TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) { - struct resend_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = RESEND}, - .offset = htonl(100), - .length = htonl(200), - .priority = 3}; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 2000, 100); - ASSERT_NE(NULL, crpc); + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 3000); + + ASSERT_NE(NULL, srpc); + self->data.ack = (struct homa_ack) { + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->client_id)}; + self->data.common.sender_id = cpu_to_be64(self->client_id+10); + unit_log_clear(); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); + EXPECT_SUBSTR("ack 1235", unit_log_get()); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_dispatch_pkts__invoke_homa_grant_check_rpc) +{ + self->data.incoming = htonl(1000); + self->data.message_length = htonl(20000); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &self->data.common, + 0, 0)); unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_SUBSTR("id 1235", unit_log_get()); +} +#endif /* See strip.py */ +TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) +{ + struct homa_rpc *dead = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 20000); + struct homa_rpc *srpc; + int dead_skbs; + + mock_clock_tick = 10; + homa_rpc_end(dead); + dead_skbs = self->hsk.dead_skbs; + EXPECT_TRUE(dead_skbs >= 30); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 10000, 5000); + ASSERT_NE(NULL, srpc); + self->homa.dead_buffs_limit = 16; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit BUSY", unit_log_get()); + /* First packet: criteria for reaps not met. 
*/
+	self->data.common.dport = htons(self->hsk.port);
+	homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common,
+			1400, 0));
+	EXPECT_EQ(dead_skbs, self->hsk.dead_skbs);
+#ifndef __STRIP__ /* See strip.py */
+	EXPECT_EQ(0, homa_metrics_per_cpu()->data_pkt_reap_cycles);
+#endif /* See strip.py */
+
+	/* Second packet: must reap because of dead_buffs_limit (should only
+	 * reap a few skbs).
+	 */
+	self->homa.dead_buffs_limit = 15;
+	self->homa.reap_limit = 5;
+	homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common,
+			1400, 0));
+	EXPECT_EQ(dead_skbs - 5, self->hsk.dead_skbs);
+#ifndef __STRIP__ /* See strip.py */
+	EXPECT_NE(0, homa_metrics_per_cpu()->data_pkt_reap_cycles);
+#endif /* See strip.py */
+
+	/* Third packet: must reap all dead skbs (SOCK_NOSPACE). */
+	set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags);
+	homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common,
+			1400, 0));
+	EXPECT_EQ(0, self->hsk.dead_skbs);
 }
-TEST_F(homa_incoming, homa_resend_pkt__client_send_data)
+
+TEST_F(homa_incoming, homa_data_pkt__basics)
 {
-	struct resend_header h = {{.sport = htons(self->server_port),
-			.dport = htons(self->client_port),
-			.sender_id = cpu_to_be64(self->server_id),
-			.type = RESEND},
-			.offset = htonl(100),
-			.length = htonl(200),
-			.priority = 3};
 	struct homa_rpc *crpc = unit_client_rpc(&self->hsk,
 			UNIT_OUTGOING, self->client_ip, self->server_ip,
-			self->server_port, self->client_id, 2000, 100);
+			self->server_port, self->client_id, 1000, 1600);
+
 	ASSERT_NE(NULL, crpc);
-	homa_xmit_data(crpc, false);
 	unit_log_clear();
-	mock_clear_xmit_prios();
+	crpc->msgout.next_xmit_offset = crpc->msgout.length;
+	self->data.message_length = htonl(1600);
+	homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common,
+			1400, 0), crpc);
+	EXPECT_EQ(RPC_INCOMING, crpc->state);
+	EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs));
+	EXPECT_EQ(200, crpc->msgin.bytes_remaining);
+	EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets));
+#ifndef __STRIP__ /* See strip.py */
+	EXPECT_EQ(1600, crpc->msgin.granted);
+	EXPECT_EQ(1, homa_metrics_per_cpu()->responses_received);
+#endif /* See strip.py */
+}
+TEST_F(homa_incoming, homa_data_pkt__handle_ack)
+{
+	struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->client_port,
+			self->server_id, 100, 3000);
+	struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT,
+			self->client_ip, self->server_ip, self->client_port,
+			self->server_id+2, 10000, 1000);
-	homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0),
-			&self->hsk, &self->lcache, &self->incoming_delta);
-	EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get());
-	EXPECT_STREQ("3", mock_xmit_prios);
+	ASSERT_NE(NULL, srpc1);
+	ASSERT_NE(NULL, srpc2);
+	EXPECT_EQ(8600, srpc2->msgin.bytes_remaining);
+
+	self->data.ack = (struct homa_ack) {
+		.server_port = htons(self->hsk.port),
+		.client_id = cpu_to_be64(self->client_id)};
+	self->data.common.sender_id = cpu_to_be64(self->client_id+2);
+	self->data.seg.offset = htonl(1400);
+	unit_log_clear();
+	homa_rpc_lock(srpc2);
+	homa_data_pkt(mock_skb_alloc(self->client_ip, &self->data.common,
+			1400, 0), srpc2);
+	homa_rpc_unlock(srpc2);
+	EXPECT_EQ(RPC_DEAD, srpc1->state);
+	EXPECT_SUBSTR("ack 1235; homa_rpc_end invoked", unit_log_get());
+	EXPECT_EQ(7200, srpc2->msgin.bytes_remaining);
 }
-TEST_F(homa_incoming, homa_resend_pkt__server_send_data)
+TEST_F(homa_incoming, homa_data_pkt__handle_ack_rpc_now_dead)
 {
-	struct 
resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), - .sender_id = cpu_to_be64(self->client_id), - .type = RESEND}, - .offset = htonl(100), - .length = htonl(2000), - .priority = 4}; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, + struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); + self->server_id, 10000, 1000); + ASSERT_NE(NULL, srpc); - homa_xmit_data(srpc, false); - unit_log_clear(); - mock_clear_xmit_prios(); + EXPECT_EQ(8600, srpc->msgin.bytes_remaining); - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit DATA retrans 1400@0; " - "xmit DATA retrans 1400@1400", unit_log_get()); - EXPECT_STREQ("4 4", mock_xmit_prios); + /* This is a bit contrived, but the ack terminates the RPC for which + * the data packet was intended. + */ + self->data.ack = (struct homa_ack) { + .server_port = htons(self->hsk.port), + .client_id = cpu_to_be64(self->client_id)}; + self->data.common.sender_id = cpu_to_be64(self->client_id); + self->data.seg.offset = htonl(1400); + unit_log_clear(); + homa_rpc_lock(srpc); + homa_data_pkt(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0), srpc); + homa_rpc_unlock(srpc); + EXPECT_EQ(RPC_DEAD, srpc->state); + EXPECT_SUBSTR("ack 1235; " + "homa_rpc_end invoked; " + "homa_data_pkt discarded packet", unit_log_get()); + EXPECT_EQ(8600, srpc->msgin.bytes_remaining); } +TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 2000); -TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) + ASSERT_NE(NULL, crpc); + crpc->state = RPC_DEAD; + self->data.message_length = htonl(2000); + self->data.seg.offset = htonl(1400); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 600, 1400), crpc); + EXPECT_EQ(600, crpc->msgin.bytes_remaining); + EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); + crpc->state = RPC_INCOMING; +} +TEST_F(homa_incoming, homa_data_pkt__initialize_msgin) { - struct unknown_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = UNKNOWN}}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 2000, 2000); - ASSERT_NE(NULL, crpc); - homa_xmit_data(crpc, false); - unit_log_clear(); + self->server_port, self->client_id, 1000, 1600); - mock_xmit_log_verbose = 1; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, " - "message_length 2000, offset 0, data_length 1400, " - "incoming 2000, RETRANSMIT; " - "xmit DATA from 0.0.0.0:32768, dport 99, id 1234, " - "message_length 2000, offset 1400, data_length 600, " - "incoming 2000, RETRANSMIT", - unit_log_get()); - EXPECT_EQ(-1, crpc->msgin.total_length); + ASSERT_NE(NULL, crpc); + self->data.message_length = htonl(1600); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, 0), crpc); + EXPECT_EQ(200, crpc->msgin.bytes_remaining); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1600, crpc->msgin.granted); +#endif /* See strip.py */ } 
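+
+/* The test below replaces the socket's configured buffer pool with a
+ * freshly allocated one. The fresh pool has no buffer region set up
+ * yet, so homa_data_pkt has nowhere to place incoming data and must
+ * discard the packet.
+ */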
-TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) +TEST_F(homa_incoming, homa_data_pkt__no_buffer_pool) { - struct unknown_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = UNKNOWN}}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 2000, 2000); + self->server_port, self->client_id, 1000, 1600); + ASSERT_NE(NULL, crpc); - crpc->msgout.granted = 1400; - homa_xmit_data(crpc, false); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); unit_log_clear(); - - mock_xmit_log_verbose = 1; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, " - "message_length 2000, offset 0, data_length 1400, " - "incoming 1400, RETRANSMIT", - unit_log_get()); - EXPECT_EQ(-1, crpc->msgin.total_length); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, 0), crpc); + EXPECT_STREQ("homa_data_pkt discarded packet", unit_log_get()); } -TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) +TEST_F(homa_incoming, homa_data_pkt__wrong_server_rpc_state) { - struct unknown_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), - .sender_id = cpu_to_be64(self->client_id), - .type = UNKNOWN}}; struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); + self->server_id, 1400, 5000); + ASSERT_NE(NULL, srpc); unit_log_clear(); - - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); + homa_data_pkt(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0), srpc); + EXPECT_EQ(RPC_OUTGOING, srpc->state); + EXPECT_STREQ("homa_data_pkt discarded packet", unit_log_get()); } - -TEST_F(homa_incoming, homa_cutoffs_pkt_basics) +TEST_F(homa_incoming, homa_data_pkt__no_buffers) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(11200, crpc->msgout.granted); + self->server_port, self->client_id, 1000, 5000); + + EXPECT_NE(NULL, crpc); unit_log_clear(); - struct cutoffs_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = CUTOFFS}, - .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), - htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, - .cutoff_version = 400}; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(400, crpc->peer->cutoff_version); - EXPECT_EQ(9, crpc->peer->unsched_cutoffs[1]); - EXPECT_EQ(3, crpc->peer->unsched_cutoffs[7]); -} -TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) -{ - struct homa_peer *peer; - struct cutoffs_header h = {{.sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = CUTOFFS}, - .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), - htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, - .cutoff_version = 400}; - struct sk_buff *skb = mock_skb_new(self->server_ip, &h.common, 0, 0); - 
mock_kmalloc_errors = 1; - homa_cutoffs_pkt(skb, &self->hsk); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_kmalloc_errors); - peer = homa_peer_find(&self->homa.peers, self->server_ip, - &self->hsk.inet); - ASSERT_FALSE(IS_ERR(peer)); - EXPECT_EQ(0, peer->cutoff_version); + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, 0), crpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1400, homa_metrics_per_cpu()->dropped_data_no_bufs); +#endif /* See strip.py */ + EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } - -TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) +TEST_F(homa_incoming, homa_data_pkt__update_delta) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 100, 3000); - ASSERT_NE(NULL, crpc); + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 5000); + + EXPECT_NE(NULL, crpc); unit_log_clear(); - mock_xmit_log_verbose = 1; - struct need_ack_header h = {.common = { - .sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = NEED_ACK}}; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit ACK from 0.0.0.0:40000, dport 99, id 1234, acks", - unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[ - NEED_ACK - DATA]); + + /* Total incoming goes up on first packet (count unscheduled bytes). */ + self->data.message_length = htonl(5000); +#ifndef __STRIP__ /* See strip.py */ + self->data.incoming = htonl(4000); +#endif /* See strip.py */ + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, 0), crpc); + + /* Total incoming drops on subsequent packet. */ + self->data.seg.offset = htonl(2800); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, 2800), crpc); + + /* Duplicate packet should have no effect. */ + self->data.seg.offset = htonl(2800); + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, + 1400, 2800), crpc); } -TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) +TEST_F(homa_incoming, homa_data_pkt__handoff) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 100, 3000); + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 3000); + ASSERT_NE(NULL, crpc); unit_log_clear(); - mock_xmit_log_verbose = 1; - struct need_ack_header h = {.common = { - .sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = NEED_ACK}}; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); + crpc->msgout.next_xmit_offset = crpc->msgout.length; + + /* First packet triggers handoff. 
*/
+	self->data.message_length = htonl(3000);
+	self->data.seg.offset = htonl(1400);
+	homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common,
+			1400, 0), crpc);
+	EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs));
+	EXPECT_TRUE(test_bit(RPC_PKTS_READY, &crpc->flags));
+	EXPECT_EQ(1600, crpc->msgin.bytes_remaining);
+	EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets));
+	EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get());
+
+	/* Second packet doesn't trigger a handoff because one is
+	 * already pending.
+	 */
+	self->data.message_length = htonl(3000);
+	self->data.seg.offset = htonl(2800);
+	unit_log_clear();
+	homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common,
+			200, 0), crpc);
 	EXPECT_STREQ("", unit_log_get());
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[
-			NEED_ACK - DATA]);
 }
-TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming)
+#ifndef __STRIP__ /* See strip.py */
+TEST_F(homa_incoming, homa_data_pkt__send_cutoffs)
 {
-	struct homa_rpc *crpc = unit_client_rpc(&self->hsk,
-			UNIT_OUTGOING, self->client_ip, self->server_ip,
-			self->server_port, self->client_id, 100, 3000);
-	ASSERT_NE(NULL, crpc);
-	unit_log_clear();
+	self->homa.cutoff_version = 2;
+	self->homa.unsched_cutoffs[0] = 19;
+	self->homa.unsched_cutoffs[1] = 18;
+	self->homa.unsched_cutoffs[2] = 17;
+	self->homa.unsched_cutoffs[3] = 16;
+	self->homa.unsched_cutoffs[4] = 15;
+	self->homa.unsched_cutoffs[5] = 14;
+	self->homa.unsched_cutoffs[6] = 13;
+	self->homa.unsched_cutoffs[7] = 12;
+	self->data.message_length = htonl(5000);
 	mock_xmit_log_verbose = 1;
-	struct need_ack_header h = {.common = {
-			.sport = htons(self->server_port),
-			.dport = htons(self->client_port),
-			.sender_id = cpu_to_be64(self->server_id),
-			.type = NEED_ACK}};
-	homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0),
-			&self->hsk, &self->lcache, &self->incoming_delta);
+	homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common,
+			1400, 0));
+	EXPECT_SUBSTR("cutoffs 19 18 17 16 15 14 13 12, version 2",
+			unit_log_get());
+
+	/* Try again, but this time no CUTOFFS should be sent because
+	 * no time has elapsed since the last cutoffs were sent.
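+	 * (Time here is mock time: nothing in this test advances the
+	 * clock, so the second packet appears to arrive at the same
+	 * instant as the first.)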
+ */ + unit_log_clear(); + self->homa.cutoff_version = 3; + self->data.seg.offset = 1400; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0)); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[ - NEED_ACK - DATA]); } -TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) +TEST_F(homa_incoming, homa_data_pkt__cutoffs_up_to_date) { - struct homa_peer *peer = homa_peer_find(&self->homa.peers, - self->server_ip, &self->hsk.inet); - peer->acks[0].client_port = htons(self->client_port); - peer->acks[0].server_port = htons(self->server_port); - peer->acks[0].client_id = cpu_to_be64(self->client_id+2); - peer->num_acks = 1; - mock_xmit_log_verbose = 1; - struct need_ack_header h = {.common = { - .sport = htons(self->server_port), - .dport = htons(self->client_port), - .sender_id = cpu_to_be64(self->server_id), - .type = NEED_ACK}}; - homa_pkt_dispatch(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_STREQ("xmit ACK from 0.0.0.0:40000, dport 99, id 1234, " - "acks [cp 40000, sp 99, id 1236]", unit_log_get()); + self->homa.cutoff_version = 123; + self->data.cutoff_version = htons(123); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0)); + EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); } -TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists) +TEST_F(homa_incoming, homa_grant_pkt__basics) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 5000); - ASSERT_NE(NULL, srpc); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - unit_log_clear(); - mock_xmit_log_verbose = 1; - struct ack_header h = {.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .sender_id = cpu_to_be64(self->client_id), - .type = ACK}, - .num_acks = htons(0)}; - homa_pkt_dispatch(mock_skb_new(self->client_ip, &h.common, 0, 0), - &self->hsk, &self->lcache, &self->incoming_delta); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[ - ACK - DATA]); -} -TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) -{ - struct homa_sock hsk1; - mock_sock_init(&hsk1, &self->homa, self->server_port); - struct homa_rpc *srpc1 = unit_server_rpc(&hsk1, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 5000); - struct homa_rpc *srpc2 = unit_server_rpc(&hsk1, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id+2, 100, 5000); - ASSERT_NE(NULL, srpc1); - ASSERT_NE(NULL, srpc2); - EXPECT_EQ(2, unit_list_length(&hsk1.active_rpcs)); - unit_log_clear(); - mock_xmit_log_verbose = 1; - struct ack_header h = {.common = { - .sport = htons(self->client_port + 1), - .dport = htons(self->server_port), + self->server_id, 100, 20000); + struct homa_grant_hdr h = {{.sport = htons(srpc->dport), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), - .type = ACK}, - .num_acks = htons(2)}; - h.acks[0] = (struct homa_ack) {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->server_id+5)}; - h.acks[1] = (struct homa_ack) {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->server_id+1)}; - homa_pkt_dispatch(mock_skb_new(self->client_ip, 
&h.common, 0, 0), - &hsk1, &self->lcache, &self->incoming_delta); - EXPECT_EQ(1, unit_list_length(&hsk1.active_rpcs)); - EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc1)); - EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc2)); - homa_sock_shutdown(&hsk1); -} + .type = GRANT}, + .offset = htonl(11000), + .priority = 3}; -TEST_F(homa_incoming, homa_check_grantable__not_ready_for_grant) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 5000, 100); ASSERT_NE(NULL, srpc); + homa_rpc_lock(srpc); + XMIT_DATA(srpc, false); + homa_rpc_unlock(srpc); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("", unit_log_get()); - srpc->msgin.total_length = 20000; - srpc->msgin.bytes_remaining = 15000; - srpc->msgin.incoming = 18000; - homa_check_grantable(&self->homa, srpc); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(11000, srpc->msgout.granted); + EXPECT_STREQ("xmit DATA 1400@10000", unit_log_get()); + + /* Don't let grant offset go backwards. */ + h.offset = htonl(10000); unit_log_clear(); - unit_log_grantables(&self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(11000, srpc->msgout.granted); EXPECT_STREQ("", unit_log_get()); - srpc->msgin.incoming = 20000; - homa_check_grantable(&self->homa, srpc); + /* Wrong state. */ + h.offset = htonl(20000); + srpc->state = RPC_INCOMING; unit_log_clear(); - unit_log_grantables(&self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(11000, srpc->msgout.granted); EXPECT_STREQ("", unit_log_get()); - srpc->msgin.incoming = 18000; - srpc->msgin.bytes_remaining = 10000; - homa_check_grantable(&self->homa, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1235, remaining 10000", - unit_log_get()); -} -TEST_F(homa_incoming, homa_check_grantable__insert_in_peer_list) -{ - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 100000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 3, 50000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 5, 120000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 7, 70000, 100); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 3, remaining 48600; " - "request from 196.168.0.1, id 7, remaining 68600; " - "request from 196.168.0.1, id 1, remaining 98600; " - "request from 196.168.0.1, id 5, remaining 118600", - unit_log_get()); - EXPECT_EQ(1, self->homa.num_grantable_peers); + /* Must restore old state to avoid potential crashes. 
*/ + srpc->state = RPC_OUTGOING; } -TEST_F(homa_incoming, homa_check_grantable__adjust_order_in_peer_list) +TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) { - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 3, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 5, 40000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 7, 50000, 100); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 196.168.0.1, id 3, remaining 28600; " - "request from 196.168.0.1, id 5, remaining 38600; " - "request from 196.168.0.1, id 7, remaining 48600", - unit_log_get()); - - struct homa_rpc *srpc = homa_find_server_rpc(&self->hsk, - self->client_ip, self->client_port, 5); - ASSERT_NE(NULL, srpc); - homa_rpc_unlock(srpc); - srpc->msgin.bytes_remaining = 28600; - homa_check_grantable(&self->homa, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 196.168.0.1, id 3, remaining 28600; " - "request from 196.168.0.1, id 5, remaining 28600; " - "request from 196.168.0.1, id 7, remaining 48600", - unit_log_get()); - - srpc->msgin.bytes_remaining = 28599; - homa_check_grantable(&self->homa, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 196.168.0.1, id 5, remaining 28599; " - "request from 196.168.0.1, id 3, remaining 28600; " - "request from 196.168.0.1, id 7, remaining 48600", - unit_log_get()); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + struct homa_grant_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = GRANT}, + .offset = htonl(25000), + .priority = 3}; - srpc = homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, 7); - ASSERT_NE(NULL, srpc); - homa_rpc_unlock(srpc);; - srpc->msgin.bytes_remaining = 1000; - homa_check_grantable(&self->homa, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 7, remaining 1000; " - "request from 196.168.0.1, id 1, remaining 18600; " - "request from 196.168.0.1, id 5, remaining 28599; " - "request from 196.168.0.1, id 3, remaining 28600", - unit_log_get()); -} -TEST_F(homa_incoming, homa_check_grantable__age_tiebreaker_in_peer_list) -{ - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - 3, 30000, 100); - struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - 5, 30000, 100); - struct homa_rpc *srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - 7, 50000, 100); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 
196.168.0.1, id 3, remaining 28600; " - "request from 196.168.0.1, id 5, remaining 28600; " - "request from 196.168.0.1, id 7, remaining 48600", - unit_log_get()); - srpc4->msgin.bytes_remaining = 28600; - srpc4->msgin.birth = 1000; - srpc3->msgin.birth = 2000; - srpc2->msgin.birth = 500; - homa_check_grantable(&self->homa, srpc4); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 196.168.0.1, id 3, remaining 28600; " - "request from 196.168.0.1, id 7, remaining 28600; " - "request from 196.168.0.1, id 5, remaining 28600", - unit_log_get()); -} -TEST_F(homa_incoming, homa_check_grantable__insert_in_homa_list) -{ - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 100000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 50000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+2, - self->server_ip, self->client_port, 5, 120000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+3, - self->server_ip, self->client_port, 7, 70000, 100); + ASSERT_NE(NULL, crpc); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 197.168.0.1, id 3, remaining 48600; " - "request from 199.168.0.1, id 7, remaining 68600; " - "request from 196.168.0.1, id 1, remaining 98600; " - "request from 198.168.0.1, id 5, remaining 118600", - unit_log_get()); - EXPECT_EQ(4, self->homa.num_grantable_peers); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(20000, crpc->msgout.granted); } -TEST_F(homa_incoming, homa_check_grantable__age_tiebreaker_inserting_in_homa_list) +#endif /* See strip.py */ + +TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) { - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip+1, self->server_ip, self->client_port, - 3, 30000, 100); - struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip+2, self->server_ip, self->client_port, - 5, 30000, 100); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 197.168.0.1, id 3, remaining 28600; " - "request from 198.168.0.1, id 5, remaining 28600", - unit_log_get()); + struct homa_resend_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->server_port), + .sender_id = cpu_to_be64(self->client_id), + .type = RESEND}, + .offset = htonl(100), + .length = htonl(200)}; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip+1, - self->server_port, self->client_id, 1400, 30000); - srpc2->msgin.birth = 1000; - srpc3->msgin.birth = 2000; - mock_cycles = 1500; - self->data.message_length = htonl(30000); - homa_data_pkt(mock_skb_new(self->server_ip+1, &self->data.common, - 1400, 0), crpc, NULL, &self->incoming_delta); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 197.168.0.1, id 3, remaining 28600; " - "response from 2.2.3.4, id 1234, remaining 28600; " - "request from 198.168.0.1, id 5, remaining 28600", - unit_log_get()); - EXPECT_EQ(4, self->homa.num_grantable_peers); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 
0, 0)); + EXPECT_STREQ("xmit RPC_UNKNOWN", unit_log_get()); } -TEST_F(homa_incoming, homa_check_grantable__move_upward_in_homa_list) +TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) { - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+2, - self->server_ip, self->client_port, 5, 40000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+3, - self->server_ip, self->client_port, 7, 50000, 100); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 197.168.0.1, id 3, remaining 28600; " - "request from 198.168.0.1, id 5, remaining 38600; " - "request from 199.168.0.1, id 7, remaining 48600", - unit_log_get()); + struct homa_resend_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->server_port), + .sender_id = cpu_to_be64(self->client_id), + .type = RESEND}, + .offset = htonl(0), + .length = htonl(200)}; + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_IN_SERVICE, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 2000, 20000); - struct homa_rpc *srpc = homa_find_server_rpc(&self->hsk, - self->client_ip+2, self->client_port, 5); ASSERT_NE(NULL, srpc); - homa_rpc_unlock(srpc); - srpc->msgin.bytes_remaining = 28600; - homa_check_grantable(&self->homa, srpc); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 197.168.0.1, id 3, remaining 28600; " - "request from 198.168.0.1, id 5, remaining 28600; " - "request from 199.168.0.1, id 7, remaining 48600", - unit_log_get()); - srpc->msgin.bytes_remaining = 28599; - homa_check_grantable(&self->homa, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 198.168.0.1, id 5, remaining 28599; " - "request from 197.168.0.1, id 3, remaining 28600; " - "request from 199.168.0.1, id 7, remaining 48600", - unit_log_get()); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit BUSY", unit_log_get()); +} +TEST_F(homa_incoming, homa_resend_pkt__rpc_incoming_server_sends_busy) +{ + /* Entire msgin has not been received yet. But we have received + * everything we have granted so far. 
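+	 * Since the server has nothing useful to retransmit, it should
+	 * respond with BUSY simply to confirm that the RPC is still
+	 * alive.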
+ */ + struct homa_resend_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->server_port), + .sender_id = cpu_to_be64(self->client_id), + .type = RESEND}, + .offset = htonl(1400), + .length = htonl(200)}; + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 2000, 20000); - srpc = homa_find_server_rpc(&self->hsk, self->client_ip+3, - self->client_port, 7); ASSERT_NE(NULL, srpc); - homa_rpc_unlock(srpc);; - srpc->msgin.bytes_remaining = 1000; - homa_check_grantable(&self->homa, srpc); +#ifndef __STRIP__ /* See strip.py */ + srpc->msgin.granted = 1400; +#endif /* See strip.py */ unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 199.168.0.1, id 7, remaining 1000; " - "request from 196.168.0.1, id 1, remaining 18600; " - "request from 198.168.0.1, id 5, remaining 28599; " - "request from 197.168.0.1, id 3, remaining 28600", - unit_log_get()); + + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + // The server might send a GRANT right after BUSY so just check substr + EXPECT_SUBSTR("xmit BUSY", unit_log_get()); } -TEST_F(homa_incoming, homa_check_grantable__age_tiebreaker_moving_upward_in_homa_list) +TEST_F(homa_incoming, homa_resend_pkt__negative_length_in_resend) { - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip+1, self->server_ip, self->client_port, - 3, 30000, 100); - struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip+2, self->server_ip, self->client_port, - 5, 30000, 100); - struct homa_rpc *srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip+3, self->server_ip, self->client_port, - 7, 50000, 100); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 197.168.0.1, id 3, remaining 28600; " - "request from 198.168.0.1, id 5, remaining 28600; " - "request from 199.168.0.1, id 7, remaining 48600", - unit_log_get()); + struct homa_resend_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->server_port), + .sender_id = cpu_to_be64(self->client_id), + .type = RESEND}, + .offset = htonl(0), + .length = htonl(-1)}; + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 2000, 20000); - srpc2->msgin.birth = 1000; - srpc3->msgin.birth = 2000; - srpc4->msgin.birth = 1500; - srpc4->msgin.bytes_remaining = 28600; - homa_check_grantable(&self->homa, srpc4); + ASSERT_NE(NULL, srpc); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 197.168.0.1, id 3, remaining 28600; " - "request from 199.168.0.1, id 7, remaining 28600; " - "request from 198.168.0.1, id 5, remaining 28600", - unit_log_get()); -} + srpc->msgout.next_xmit_offset = 2000; -TEST_F(homa_incoming, homa_send_grants__basics) + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit DATA retrans 1400@0; " + "xmit DATA retrans 1400@1400", unit_log_get()); +} +TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) { - struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, 
self->client_port, 1, 20000, 100); - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 30000, 100); - srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+2, - self->server_ip, self->client_port, 5, 40000, 100); - srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+3, - self->server_ip, self->client_port, 7, 50000, 100); - EXPECT_EQ(34400, atomic_read(&self->homa.total_incoming)); + /* Important to respond to resends even if client thinks the + * server must already have received everything. + */ + struct homa_resend_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RESEND}, + .offset = htonl(100), + .length = htonl(200)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 2000, 3000); - /* First attempt: no headroom for grants */ - self->homa.max_incoming = 30000; + ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("", unit_log_get()); - /* Second attempt: can grant only the first message and part of - * the second. */ - self->homa.max_incoming = 36000; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); +} +TEST_F(homa_incoming, homa_resend_pkt__clip_range_to_tx_end) +{ + struct homa_resend_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RESEND}, + .offset = htonl(100), + .length = htonl(2000)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 100); + + ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 11400@3; xmit GRANT 10200@2", unit_log_get()); - EXPECT_EQ(11400, srpc1->msgin.incoming); + crpc->msgout.next_xmit_offset = 1400; + IF_NO_STRIP(crpc->msgout.granted = 5000); - /* Third attempt: finish granting to second message. 
*/ + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); +} +TEST_F(homa_incoming, homa_resend_pkt__no_need_to_clip_range) +{ + struct homa_resend_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RESEND}, + .offset = htonl(100), + .length = htonl(300)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 100); - self->homa.max_incoming = 37200; + ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 11400@2", unit_log_get()); + IF_NO_STRIP(crpc->msgout.granted = 2800); + crpc->msgout.next_xmit_offset = 2800; + + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_resend_pkt__update_granted_and_xmit) +{ + struct homa_resend_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RESEND}, + .offset = htonl(1400), + .length = htonl(2000)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 100); - /* Try again (no new grants, since nothing has changed). */ + ASSERT_NE(NULL, crpc); + crpc->msgout.granted = 1400; + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(1400, crpc->msgout.next_xmit_offset); - /* Now create enough headroom for all of the messages. 
*/ - self->homa.max_incoming = 50000; - unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 11400@1; xmit GRANT 11400@0", unit_log_get()); - EXPECT_EQ(11400, srpc2->msgin.incoming); - EXPECT_EQ(11400, srpc3->msgin.incoming); - EXPECT_EQ(11400, srpc4->msgin.incoming); - EXPECT_EQ(40000, atomic_read(&self->homa.total_incoming)); -} -TEST_F(homa_incoming, homa_send_grants__enlarge_window) -{ - struct homa_rpc *srpc1, *srpc2; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 40000, 100); - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 40000, 100); - EXPECT_EQ(17200, atomic_read(&self->homa.total_incoming)); - - self->homa.max_incoming = 40000; - self->homa.max_grant_window = 40000; - unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 16400@1; xmit GRANT 16400@0", unit_log_get()); - EXPECT_EQ(16400, srpc1->msgin.incoming); - EXPECT_EQ(16400, srpc2->msgin.incoming); - EXPECT_EQ(30000, atomic_read(&self->homa.total_incoming)); -} -TEST_F(homa_incoming, homa_send_grants__one_grant_per_peer) -{ - struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 3, 30000, 100); - srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 5, 40000, 100); - srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 7, 50000, 100); - srpc1->msgin.incoming = 1400; - srpc2->msgin.incoming = 1400; - srpc3->msgin.incoming = 1400; - srpc4->msgin.incoming = 1400; - atomic_set(&self->homa.total_incoming, 0); - self->homa.max_incoming = 25000; - homa_send_grants(&self->homa); - EXPECT_EQ(11400, srpc1->msgin.incoming); - EXPECT_EQ(1400, srpc2->msgin.incoming); - EXPECT_EQ(1400, srpc3->msgin.incoming); - EXPECT_EQ(11400, srpc4->msgin.incoming); -} -TEST_F(homa_incoming, homa_send_grants__truncate_grant_to_message_length) + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_EQ(3400, crpc->msgout.granted); + EXPECT_EQ(4200, crpc->msgout.next_xmit_offset); +} +TEST_F(homa_incoming, homa_resend_pkt__clip_granted_to_message_length) { - struct homa_rpc *srpc; - srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 11000, 100); - EXPECT_NE(NULL, srpc); - EXPECT_EQ(8600, atomic_read(&self->homa.total_incoming)); - - self->homa.max_incoming = 50000; - unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 11000@0", unit_log_get()); - EXPECT_EQ(11000, srpc->msgin.incoming); - EXPECT_EQ(9600, atomic_read(&self->homa.total_incoming)); -} -TEST_F(homa_incoming, homa_send_grants__choose_priority_level) -{ - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 40000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+2, - self->server_ip, self->client_port, 5, 20000, 100); - atomic_set(&self->homa.total_incoming, 0); - self->homa.max_incoming = 30000; - homa_send_grants(&self->homa); - EXPECT_SUBSTR("xmit GRANT 11400@2; " - 
"xmit GRANT 11400@1; " - "xmit GRANT 11400@0", unit_log_get()); -} -TEST_F(homa_incoming, homa_send_grants__share_lowest_priority_level) -{ - struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 30000, 100); - srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+2, - self->server_ip, self->client_port, 5, 40000, 100); - srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+3, - self->server_ip, self->client_port, 5, 50000, 100); - srpc1->msgin.incoming = 15000; - atomic_set(&self->homa.total_incoming, 0); - self->homa.max_incoming = 30000; - self->homa.max_sched_prio = 2; - homa_send_grants(&self->homa); - EXPECT_SUBSTR("xmit GRANT 11400@1; " - "xmit GRANT 11400@0; " - "xmit GRANT 11400@0", unit_log_get()); - EXPECT_EQ(11400, srpc2->msgin.incoming); - EXPECT_EQ(11400, srpc3->msgin.incoming); - EXPECT_EQ(11400, srpc4->msgin.incoming); -} -TEST_F(homa_incoming, homa_send_grants__remove_from_grantable) -{ - struct homa_rpc *srpc1, *srpc2, *srpc3; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 11000, 100); - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 30000, 100); - srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 5, 20000, 100); - atomic_set(&self->homa.total_incoming, 0); - self->homa.max_incoming = 3000; - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 9600; " - "request from 196.168.0.1, id 5, remaining 18600; " - "request from 197.168.0.1, id 3, remaining 28600", - unit_log_get()); + struct homa_resend_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RESEND}, + .offset = htonl(1400), + .length = htonl(6000)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 100); - /* First attempt grants to one message per host. */ - unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 11000@1; xmit GRANT 11400@0", unit_log_get()); - EXPECT_EQ(11000, srpc1->msgin.incoming); - EXPECT_EQ(11400, srpc2->msgin.incoming); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 5, remaining 18600; " - "request from 197.168.0.1, id 3, remaining 28600", - unit_log_get()); + ASSERT_NE(NULL, crpc); - /* Second attempt will now get second message from host. 
*/ - unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 10600@1", unit_log_get()); - EXPECT_EQ(10600, srpc3->msgin.incoming); -} -TEST_F(homa_incoming, homa_send_grants__MAX_GRANTS_exceeded) -{ - mock_max_grants = 3; - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 32, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+2, - self->server_ip, self->client_port, 5, 40000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+3, - self->server_ip, self->client_port, 7, 50000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+4, - self->server_ip, self->client_port, 9, 60000, 100); - atomic_set(&self->homa.total_incoming, 0); - self->homa.max_incoming = 10000; - unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 11400@3; xmit GRANT 11400@2; " - "xmit GRANT 11400@1", unit_log_get()); -} -TEST_F(homa_incoming, homa_send_grants__grant_fifo) -{ - struct homa_rpc *srpc1, *srpc2; - self->homa.fifo_grant_increment = 5000; - self->homa.grant_fifo_fraction = 100; - self->homa.grant_nonfifo_left = 6000; - self->homa.grant_nonfifo = 10000; - self->homa.max_overcommit = 1; - self->homa.max_incoming = 10000; - mock_cycles = ~0; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 30000, 100); - ASSERT_NE(NULL, srpc1); - EXPECT_EQ(10000, srpc1->msgin.incoming); - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 1, 20000, 100); - ASSERT_NE(NULL, srpc2); - srpc2->msgin.incoming = 9000; - atomic_set(&self->homa.total_incoming, 7600); - - /* First call: not time for FIFO grants yet. */ - unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 11400@1", unit_log_get()); - EXPECT_EQ(11400, srpc2->msgin.incoming); - EXPECT_EQ(3600, self->homa.grant_nonfifo_left); - EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); - - /* Second call: time for a FIFO grant. */ - unit_log_clear(); - srpc2->msgin.incoming = 5000; - atomic_set(&self->homa.total_incoming, 5400); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 15000@3; xmit GRANT 9600@1", unit_log_get()); - EXPECT_EQ(15000, srpc1->msgin.incoming); - EXPECT_EQ(9600, srpc2->msgin.incoming); - EXPECT_EQ(9000, self->homa.grant_nonfifo_left); - EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); - - /* Third call: time for a FIFO grant, but FIFO fraction is zero. 
*/ - unit_log_clear(); - srpc1->msgin.incoming = 5000; - srpc2->msgin.incoming = 5000; - atomic_set(&self->homa.total_incoming, 8000); - self->homa.grant_nonfifo_left = 1000; - self->homa.grant_fifo_fraction = 0; - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 7000@1", unit_log_get()); - EXPECT_EQ(7000, srpc2->msgin.incoming); - EXPECT_EQ(9000, self->homa.grant_nonfifo_left); -} -TEST_F(homa_incoming, homa_send_grants__dont_grant_fifo_no_inactive_rpcs) -{ - struct homa_rpc *srpc1, *srpc2; - self->homa.rtt_bytes = 10000; - self->homa.fifo_grant_increment = 5000; - self->homa.max_sched_prio = 3; - self->homa.grant_fifo_fraction = 100; - self->homa.grant_nonfifo_left = 1000; - self->homa.grant_nonfifo = 10000; - self->homa.max_overcommit = 2; - self->homa.max_incoming = 10000; - mock_cycles = ~0; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 30000, 100); - ASSERT_NE(NULL, srpc1); - srpc1->msgin.incoming = 10000; - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 1, 20000, 100); - ASSERT_NE(NULL, srpc2); - srpc2->msgin.incoming = 9000; - atomic_set(&self->homa.total_incoming, 8000); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_EQ(5000, crpc->msgout.granted); +} +#endif /* See strip.py */ +TEST_F(homa_incoming, homa_resend_pkt__requested_data_hasnt_been_sent_yet) +{ + struct homa_resend_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RESEND}, + .offset = htonl(100), + .length = htonl(200)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 2000, 100); + ASSERT_NE(NULL, crpc); + unit_reset_tx(crpc); unit_log_clear(); - homa_send_grants(&self->homa); - EXPECT_STREQ("xmit GRANT 11000@1", unit_log_get()); - EXPECT_EQ(10000, srpc1->msgin.incoming); - EXPECT_EQ(11000, srpc2->msgin.incoming); - EXPECT_EQ(9000, self->homa.grant_nonfifo_left); + + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_SUBSTR("xmit BUSY", unit_log_get()); } -TEST_F(homa_incoming, homa_grant_fifo__basics) +TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) { - struct homa_rpc *srpc; - self->homa.rtt_bytes = 10000; - self->homa.fifo_grant_increment = 5000; - self->homa.max_sched_prio = 2; - mock_cycles = ~0; - srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 40000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 5, 20000, 100); - ASSERT_NE(NULL, srpc); - EXPECT_EQ(10000, srpc->msgin.incoming); - - unit_log_clear(); - EXPECT_EQ(5000, homa_grant_fifo(&self->homa)); - EXPECT_STREQ("xmit GRANT 15000@2", unit_log_get()); - EXPECT_EQ(15000, srpc->msgin.incoming); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.fifo_grants); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.fifo_grants_no_incoming); -} -TEST_F(homa_incoming, homa_grant_fifo__pity_grant_still_active) -{ - struct homa_rpc *srpc1, *srpc2; - self->homa.rtt_bytes = 10000; - self->homa.fifo_grant_increment = 5000; - self->homa.max_sched_prio = 2; - mock_cycles = ~0; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - 
self->server_ip, self->client_port, 1, 40000, 100); - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 3, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 5, 20000, 100); - ASSERT_NE(NULL, srpc1); - ASSERT_NE(NULL, srpc2); - srpc1->msgin.incoming = 16400; + struct homa_rpc_unknown_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RPC_UNKNOWN}}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 2000, 2000); + ASSERT_NE(NULL, crpc); + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); unit_log_clear(); - EXPECT_EQ(5000, homa_grant_fifo(&self->homa)); - EXPECT_STREQ("xmit GRANT 15000@2", unit_log_get()); - EXPECT_EQ(16400, srpc1->msgin.incoming); - EXPECT_EQ(15000, srpc2->msgin.incoming); + + mock_xmit_log_verbose = 1; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 2000, RETRANSMIT; " + "xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 1400, data_length 600, incoming 2000, RETRANSMIT", + unit_log_get()); +#else /* See strip.py */ + EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, RETRANSMIT; " + "xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 1400, data_length 600, RETRANSMIT", + unit_log_get()); +#endif /* See strip.py */ + EXPECT_EQ(-1, crpc->msgin.length); } -TEST_F(homa_incoming, homa_grant_fifo__no_good_candidates) +TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) { - struct homa_rpc *srpc1; - self->homa.rtt_bytes = 10000; - self->homa.fifo_grant_increment = 5000; - self->homa.max_sched_prio = 2; - mock_cycles = ~0; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 40000, 100); - ASSERT_NE(NULL, srpc1); - srpc1->msgin.incoming = 16400; + struct homa_rpc_unknown_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RPC_UNKNOWN}}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 2000, 2000); + ASSERT_NE(NULL, crpc); +#ifndef __STRIP__ /* See strip.py */ + crpc->msgout.granted = 1400; +#endif /* See strip.py */ + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); unit_log_clear(); - EXPECT_EQ(0, homa_grant_fifo(&self->homa)); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(16400, srpc1->msgin.incoming); + + mock_xmit_log_verbose = 1; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 1400, RETRANSMIT", + unit_log_get()); +#else /* See strip.py */ + EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, RETRANSMIT", + unit_log_get()); +#endif /* See strip.py */ + EXPECT_EQ(-1, crpc->msgin.length); } -TEST_F(homa_incoming, 
homa_grant_fifo__increment_fifo_grants_no_incoming) +TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) { - struct homa_rpc *srpc1; - self->homa.rtt_bytes = 10000; - self->homa.fifo_grant_increment = 5000; - self->homa.max_sched_prio = 2; - mock_cycles = ~0; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 40000, 100); - ASSERT_NE(NULL, srpc1); - srpc1->msgin.incoming = 1400; + struct homa_rpc_unknown_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->hsk2.port), + .sender_id = cpu_to_be64(self->client_id), + .type = RPC_UNKNOWN}}; + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 20000); + ASSERT_NE(NULL, srpc); unit_log_clear(); - EXPECT_EQ(5000, homa_grant_fifo(&self->homa)); - EXPECT_STREQ("xmit GRANT 6400@2", unit_log_get()); - EXPECT_EQ(6400, srpc1->msgin.incoming); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.fifo_grants_no_incoming); + + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); } -TEST_F(homa_incoming, homa_grant_fifo__remove_from_grantable) + +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_cutoffs_pkt_basics) { - struct homa_rpc *srpc1; - self->homa.rtt_bytes = 10000; - self->homa.fifo_grant_increment = 5000; - self->homa.max_sched_prio = 2; - mock_cycles = ~0; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 14000, 100); - ASSERT_NE(NULL, srpc1); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + struct homa_cutoffs_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = CUTOFFS}, + .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), + htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, + .cutoff_version = 400}; + ASSERT_NE(NULL, crpc); + EXPECT_EQ(10000, crpc->msgout.granted); unit_log_clear(); - EXPECT_EQ(4000, homa_grant_fifo(&self->homa)); - EXPECT_STREQ("xmit GRANT 14000@2", unit_log_get()); - EXPECT_EQ(14000, srpc1->msgin.incoming); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_incoming, homa_remove_grantable_locked__basics) + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_EQ(400, crpc->peer->cutoff_version); + EXPECT_EQ(9, crpc->peer->unsched_cutoffs[1]); + EXPECT_EQ(3, crpc->peer->unsched_cutoffs[7]); +} +TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) { - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - 1, 20000, 100); - ASSERT_NE(NULL, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600", - unit_log_get()); + struct homa_cutoffs_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = CUTOFFS}, + .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), + htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, + .cutoff_version = 400}; + struct sk_buff *skb = mock_skb_alloc(self->server_ip, &h.common, 0, 0); + struct homa_peer *peer; - /* First time: on the list. 
*/ - homa_remove_grantable_locked(&self->homa, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, self->homa.num_grantable_peers); + mock_kmalloc_errors = 1; + homa_cutoffs_pkt(skb, &self->hsk); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); + peer = homa_peer_get(&self->hsk, self->server_ip); + ASSERT_FALSE(IS_ERR(peer)); + EXPECT_EQ(0, peer->cutoff_version); + homa_peer_release(peer); +} +#endif /* See strip.py */ - /* Second time: not on the list. */ - homa_remove_grantable_locked(&self->homa, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, self->homa.num_grantable_peers); -}; -TEST_F(homa_incoming, homa_remove_grantable_locked__not_head_of_peer_list) +TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) { - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - 3, 50000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 5, 30000, 100); - ASSERT_NE(NULL, srpc); - homa_remove_grantable_locked(&self->homa, srpc); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 100, 3000); + struct homa_need_ack_hdr h = {.common = { + .sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = NEED_ACK}}; + + ASSERT_NE(NULL, crpc); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 197.168.0.1, id 5, remaining 28600", + mock_xmit_log_verbose = 1; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks", unit_log_get()); - EXPECT_EQ(2, self->homa.num_grantable_peers); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ + NEED_ACK - DATA]); +#endif /* See strip.py */ } -TEST_F(homa_incoming, homa_remove_grantable_locked__remove_peer_from_homa_list) +TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) { - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip+1, self->server_ip, self->client_port, - 3, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+2, - self->server_ip, self->client_port, 5, 40000, 100); - ASSERT_NE(NULL, srpc); - homa_remove_grantable_locked(&self->homa, srpc); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 100, 3000); + struct homa_need_ack_hdr h = {.common = { + .sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = NEED_ACK}}; + + ASSERT_NE(NULL, crpc); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 198.168.0.1, id 5, remaining 38600", - unit_log_get()); - EXPECT_EQ(2, self->homa.num_grantable_peers); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, 
&h.common, 0, 0)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-2999@0", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ + NEED_ACK - DATA]); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-2999", unit_log_get()); +#endif /* See strip.py */ } -TEST_F(homa_incoming, homa_remove_grantable_locked__peer_moves_down) +TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) { - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - 1, 20000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 3, 40000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 5, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+2, - self->server_ip, self->client_port, 7, 40000, 100); - ASSERT_NE(NULL, srpc); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600; " - "request from 196.168.0.1, id 3, remaining 38600; " - "request from 197.168.0.1, id 5, remaining 28600; " - "request from 198.168.0.1, id 7, remaining 38600", - unit_log_get()); - EXPECT_EQ(3, self->homa.num_grantable_peers); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 100, 3000); + struct homa_need_ack_hdr h = {.common = { + .sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = NEED_ACK}}; - homa_remove_grantable_locked(&self->homa, srpc); + ASSERT_NE(NULL, crpc); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 197.168.0.1, id 5, remaining 28600; " - "request from 198.168.0.1, id 7, remaining 38600; " - "request from 196.168.0.1, id 3, remaining 38600", + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0--2@0", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ + NEED_ACK - DATA]); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0--2", unit_log_get()); +#endif /* See strip.py */ +} +TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) +{ + struct homa_peer *peer = homa_peer_get(&self->hsk, self->server_ip); + struct homa_need_ack_hdr h = {.common = { + .sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = NEED_ACK}}; + + peer->acks[0].server_port = htons(self->server_port); + peer->acks[0].client_id = cpu_to_be64(self->client_id+2); + peer->num_acks = 1; + mock_xmit_log_verbose = 1; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks [sp 99, id 1236]", unit_log_get()); - EXPECT_EQ(3, self->homa.num_grantable_peers); + homa_peer_release(peer); } -TEST_F(homa_incoming, homa_remove_from_grantable__basics) +TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_no_extras) { - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("request from 196.168.0.1, id 1, remaining 18600", - unit_log_get()); - struct homa_rpc *srpc = homa_find_server_rpc(&self->hsk, - self->client_ip, self->client_port, 1); - 
ASSERT_NE(NULL, srpc); - homa_rpc_unlock(srpc); + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 5000); + struct homa_ack_hdr h = {.common = { + .sport = htons(self->client_port), + .dport = htons(self->hsk2.port), + .sender_id = cpu_to_be64(self->client_id), + .type = ACK}, + .num_acks = htons(0)}; - /* First time: on the list. */ - homa_remove_from_grantable(&self->homa, srpc); + ASSERT_NE(NULL, srpc); + EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("", unit_log_get()); + mock_xmit_log_verbose = 1; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ACK - DATA]); +#endif /* See strip.py */ +} +TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_plus_extras) +{ + struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 5000); + struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id+2, 100, 5000); + struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id+4, 100, 5000); + struct homa_ack_hdr h = {.common = { + .sport = htons(self->client_port), + .dport = htons(self->hsk2.port), + .sender_id = cpu_to_be64(self->client_id), + .type = ACK}, + .num_acks = htons(2)}; - /* Second time: not on the list (make sure it doesn't attempt to - * acquire homa_grantable_lock). 
*/ - homa_grantable_lock(&self->homa); - homa_remove_from_grantable(&self->homa, srpc); - homa_grantable_unlock(&self->homa); + ASSERT_NE(NULL, srpc1); + ASSERT_NE(NULL, srpc2); + ASSERT_NE(NULL, srpc3); + EXPECT_EQ(3, unit_list_length(&self->hsk2.active_rpcs)); unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("", unit_log_get()); + mock_xmit_log_verbose = 1; + h.acks[0] = (struct homa_ack) {.server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->server_id+1)}; + h.acks[1] = (struct homa_ack) {.server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->server_id+3)}; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc1)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc2)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc2)); } -TEST_F(homa_incoming, homa_remove_from_grantable__grant_to_other_message) +TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) { - self->homa.max_overcommit = 1; - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 1, 20000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 3, 30000, 100); + struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 5000); + struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id+2, 100, 5000); + struct homa_ack_hdr h = {.common = { + .sport = htons(self->client_port), + .dport = htons(self->hsk2.port), + .sender_id = cpu_to_be64(self->client_id + 10), + .type = ACK}, + .num_acks = htons(2)}; - struct homa_rpc *srpc = homa_find_server_rpc(&self->hsk, - self->client_ip, self->client_port, 1); - ASSERT_NE(NULL, srpc); - homa_rpc_unlock(srpc); - homa_send_grants(&self->homa); + ASSERT_NE(NULL, srpc1); + ASSERT_NE(NULL, srpc2); + EXPECT_EQ(2, unit_list_length(&self->hsk2.active_rpcs)); unit_log_clear(); - mock_xmit_log_verbose = 1; - homa_rpc_free(srpc); - EXPECT_SUBSTR("xmit GRANT", unit_log_get()); - EXPECT_SUBSTR("id 3,", unit_log_get()); + h.acks[0] = (struct homa_ack) {.server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->server_id+5)}; + h.acks[1] = (struct homa_ack) {.server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->server_id+1)}; + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); + EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc1)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc2)); } TEST_F(homa_incoming, homa_rpc_abort__basics) @@ -2203,24 +2154,23 @@ TEST_F(homa_incoming, homa_rpc_abort__basics) ASSERT_NE(NULL, crpc); unit_log_clear(); homa_rpc_abort(crpc, -EFAULT); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc->ready_links)); EXPECT_EQ(EFAULT, -crpc->error); - EXPECT_STREQ("homa_remove_from_grantable invoked; " - "sk->sk_data_ready invoked", unit_log_get()); + EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); } TEST_F(homa_incoming, homa_rpc_abort__socket_shutdown) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + 
ASSERT_NE(NULL, crpc); unit_log_clear(); self->hsk.shutdown = 1; homa_rpc_abort(crpc, -EFAULT); EXPECT_EQ(RPC_OUTGOING, crpc->state); EXPECT_EQ(EFAULT, -crpc->error); - EXPECT_STREQ("homa_remove_from_grantable invoked", unit_log_get()); self->hsk.shutdown = 0; } @@ -2235,12 +2185,13 @@ TEST_F(homa_incoming, homa_abort_rpcs__basics) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip+1, self->server_port, self->client_id+4, 5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, 0, -EPROTONOSUPPORT); - EXPECT_EQ(2, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(2, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc1->ready_links)); EXPECT_EQ(EPROTONOSUPPORT, -crpc1->error); EXPECT_EQ(0, list_empty(&crpc2->ready_links)); @@ -2249,17 +2200,15 @@ TEST_F(homa_incoming, homa_abort_rpcs__basics) } TEST_F(homa_incoming, homa_abort_rpcs__multiple_sockets) { - struct homa_sock hsk1, hsk2; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 1600); struct homa_rpc *crpc2, *crpc3; - mock_sock_init(&hsk1, &self->homa, self->server_port); - mock_sock_init(&hsk2, &self->homa, self->server_port+1); - crpc2 = unit_client_rpc(&hsk1, UNIT_OUTGOING, self->client_ip, + + crpc2 = unit_client_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 5000, 1600); - crpc3 = unit_client_rpc(&hsk1, UNIT_OUTGOING, self->client_ip, + crpc3 = unit_client_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+4, 5000, 1600); ASSERT_NE(NULL, crpc1); @@ -2267,16 +2216,14 @@ TEST_F(homa_incoming, homa_abort_rpcs__multiple_sockets) ASSERT_NE(NULL, crpc3); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, 0, -EPROTONOSUPPORT); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc1->ready_links)); EXPECT_EQ(EPROTONOSUPPORT, -crpc1->error); EXPECT_EQ(0, list_empty(&crpc2->ready_links)); EXPECT_EQ(EPROTONOSUPPORT, -crpc2->error); EXPECT_EQ(0, list_empty(&crpc3->ready_links)); - EXPECT_EQ(2, unit_list_length(&hsk1.active_rpcs)); - EXPECT_EQ(2, unit_list_length(&hsk1.ready_responses)); - homa_sock_shutdown(&hsk1); - homa_sock_shutdown(&hsk2); + EXPECT_EQ(2, unit_list_length(&self->hsk2.active_rpcs)); + EXPECT_EQ(2, unit_list_length(&self->hsk2.ready_rpcs)); } TEST_F(homa_incoming, homa_abort_rpcs__select_addr) { @@ -2289,13 +2236,14 @@ TEST_F(homa_incoming, homa_abort_rpcs__select_addr) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip+2, self->server_port, self->client_id+4, 5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, self->server_port, -ENOTCONN); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc1->ready_links)); EXPECT_EQ(RPC_OUTGOING, crpc2->state); EXPECT_EQ(RPC_OUTGOING, crpc3->state); @@ -2311,13 +2259,14 @@ TEST_F(homa_incoming, homa_abort_rpcs__select_port) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+4, 5000, 1600); + 
ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, self->server_port, -ENOTCONN); - EXPECT_EQ(2, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(2, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc1->ready_links)); EXPECT_EQ(ENOTCONN, -crpc1->error); EXPECT_EQ(RPC_OUTGOING, crpc2->state); @@ -2335,6 +2284,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__any_port) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+4, 5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); @@ -2349,18 +2299,20 @@ TEST_F(homa_incoming, homa_abort_rpcs__ignore_dead_rpcs) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 1600); + ASSERT_NE(NULL, crpc); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(RPC_DEAD, crpc->state); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, 0, -ENOTCONN); - EXPECT_EQ(0, crpc->error); + EXPECT_EQ(-EINVAL, crpc->error); } TEST_F(homa_incoming, homa_abort_rpcs__free_server_rpc) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); + self->server_id, 20000, 100); + ASSERT_NE(NULL, srpc); unit_log_clear(); homa_abort_rpcs(&self->homa, self->client_ip, 0, 0); @@ -2377,7 +2329,8 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__basics) self->server_port+1, self->client_id+2, 5000, 1600); struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); + self->server_id, 20000, 100); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, srpc); @@ -2406,12 +2359,13 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__rpc_already_dead) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 1600); + ASSERT_NE(NULL, crpc); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(RPC_DEAD, crpc->state); unit_log_clear(); homa_abort_sock_rpcs(&self->hsk, -ENOTCONN); - EXPECT_EQ(0, crpc->error); + EXPECT_EQ(-EINVAL, crpc->error); } TEST_F(homa_incoming, homa_abort_sock_rpcs__free_rpcs) { @@ -2421,6 +2375,7 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__free_rpcs) struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port+1, self->client_id+2, 5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); unit_log_clear(); @@ -2430,605 +2385,360 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__free_rpcs) EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_incoming, homa_register_interests__id_not_for_client_rpc) -{ - int result; - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_RESPONSE, 45); - EXPECT_EQ(EINVAL, -result); -} -TEST_F(homa_incoming, homa_register_interests__no_rpc_for_id) -{ - int result; - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_RESPONSE, 44); - EXPECT_EQ(EINVAL, -result); -} -TEST_F(homa_incoming, homa_register_interests__id_already_has_interest) +TEST_F(homa_incoming, homa_wait_private__rpc_not_private) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, 
self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - crpc->interest = &interest; - int result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_RESPONSE, self->client_id); - EXPECT_EQ(EINVAL, -result); - crpc->interest = NULL; + ASSERT_NE(NULL, crpc); + EXPECT_EQ(EINVAL, -homa_wait_private(crpc, 0)); } -TEST_F(homa_incoming, homa_register_interests__return_response_by_id) +TEST_F(homa_incoming, homa_wait_private__rpc_has_error) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - int result = homa_register_interests(&self->interest, &self->hsk, - 0, self->client_id); - EXPECT_EQ(0, result); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + ASSERT_NE(NULL, crpc); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); + set_bit(RPC_PRIVATE, &crpc->flags); + crpc->error = -ENOENT; + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); + EXPECT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_fast)); } -TEST_F(homa_incoming, homa_register_interests__socket_shutdown) -{ - int result; - self->hsk.shutdown = 1; - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_RESPONSE, 0); - EXPECT_EQ(ESHUTDOWN, -result); - self->hsk.shutdown = 0; -} -TEST_F(homa_incoming, homa_register_interests__specified_id_has_packets) +TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - int result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST, crpc->id); - EXPECT_EQ(0, result); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + ASSERT_NE(NULL, crpc); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); + set_bit(RPC_PRIVATE, &crpc->flags); + mock_copy_data_errors = 1; + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_wait_private(crpc, 0)); + EXPECT_EQ(-EFAULT, crpc->error); homa_rpc_unlock(crpc); } -TEST_F(homa_incoming, homa_register_interests__specified_id_has_error) +TEST_F(homa_incoming, homa_wait_private__available_immediately) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - crpc->error = -EFAULT; - int result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_NONBLOCKING, crpc->id); - EXPECT_EQ(0, result); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + ASSERT_NE(NULL, crpc); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); + set_bit(RPC_PRIVATE, &crpc->flags); + homa_rpc_lock(crpc); + EXPECT_EQ(0, homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); + EXPECT_EQ(1, test_bit(RPC_PRIVATE, &crpc->flags)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none)); } -TEST_F(homa_incoming, homa_register_interests__specified_id_not_ready) +TEST_F(homa_incoming, homa_wait_private__nonblocking) { struct homa_rpc *crpc = 
unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); + set_bit(RPC_PRIVATE, &crpc->flags); - int result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST, crpc->id); - EXPECT_EQ(0, result); - EXPECT_EQ(NULL, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + homa_rpc_lock(crpc); + EXPECT_EQ(EAGAIN, -homa_wait_private(crpc, 1)); + homa_rpc_unlock(crpc); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); } -TEST_F(homa_incoming, homa_register_interests__return_queued_response) +TEST_F(homa_incoming, homa_wait_private__signal_notify_race) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1000); + ASSERT_NE(NULL, crpc); + set_bit(RPC_PRIVATE, &crpc->flags); + IF_NO_STRIP(self->homa.poll_cycles = 0); + unit_hook_register(handoff_hook); + hook_rpc = crpc; + hook_count = 2; + mock_prepare_to_wait_errors = 1; - int result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); - EXPECT_EQ(0, result); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); - EXPECT_EQ(LIST_POISON1, self->interest.request_links.next); - EXPECT_EQ(LIST_POISON1, self->interest.response_links.next); + homa_rpc_lock(crpc); + EXPECT_EQ(0, -homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); + EXPECT_EQ(0, mock_prepare_to_wait_errors); } -TEST_F(homa_incoming, homa_register_interests__return_queued_request) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->client_port, - 1, 20000, 100); - ASSERT_NE(NULL, srpc); - int result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); - EXPECT_EQ(0, result); - EXPECT_EQ(srpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); - EXPECT_EQ(LIST_POISON1, self->interest.request_links.next); - EXPECT_EQ(LIST_POISON1, self->interest.response_links.next); - homa_rpc_unlock(srpc); -} -TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) +TEST_F(homa_incoming, homa_wait_shared__socket_already_shutdown) { - struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); - struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->client_port, - self->server_id+2, 20000, 100); + struct homa_rpc *rpc; - // First time should call sk_data_ready (for 2nd RPC). 
- unit_log_clear(); - int result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); - EXPECT_EQ(0, result); - EXPECT_EQ(srpc1, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); - EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); - homa_rpc_unlock(srpc1); + self->hsk.shutdown = 1; - // Second time shouldn't call sk_data_ready (no more RPCs). - unit_log_clear(); - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE - |HOMA_RECVMSG_NONBLOCKING, 0); - EXPECT_EQ(0, result); - EXPECT_EQ(srpc2, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); - EXPECT_STREQ("", unit_log_get()); - homa_rpc_unlock(srpc2); + rpc = homa_wait_shared(&self->hsk, 0); + EXPECT_TRUE(IS_ERR(rpc)); + EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); + self->hsk.shutdown = 0; } - -TEST_F(homa_incoming, homa_wait_for_message__rpc_from_register_interests) +TEST_F(homa_incoming, homa_wait_shared__rpc_already_ready) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); - struct homa_rpc *rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, - self->client_id); + rpc = homa_wait_shared(&self->hsk, 0); + ASSERT_FALSE(IS_ERR(rpc)); EXPECT_EQ(crpc, rpc); - homa_rpc_unlock(crpc); + EXPECT_EQ(0, crpc->msgin.packets.qlen); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none)); + homa_rpc_put(rpc); + homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_wait_for_message__error_from_register_interests) +TEST_F(homa_incoming, homa_wait_shared__multiple_rpcs_already_ready) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - - self->hsk.shutdown = 1; - struct homa_rpc *rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, - self->client_id); - EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); - self->hsk.shutdown = 0; -} -TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_polling) -{ + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 1600); struct homa_rpc *rpc; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc1); - hook_rpc = crpc1; - poll_count = 5; - self->homa.poll_cycles = 1000000; - unit_hook_register(poll_hook); + ASSERT_NE(NULL, crpc); + ASSERT_NE(NULL, crpc2); + unit_log_clear(); - rpc = homa_wait_for_message(&self->hsk, 0, self->client_id); - EXPECT_EQ(crpc1, rpc); - EXPECT_EQ(NULL, crpc1->interest); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - EXPECT_EQ(0, self->hsk.dead_skbs); + rpc = homa_wait_shared(&self->hsk, 0); + ASSERT_FALSE(IS_ERR(rpc)); + EXPECT_EQ(crpc, rpc); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); + EXPECT_SUBSTR("sk->sk_data_ready invoked", unit_log_get()); } -TEST_F(homa_incoming, homa_wait_for_message__nothing_ready_nonblocking) +TEST_F(homa_incoming, homa_wait_shared__nonblocking) { struct homa_rpc *rpc; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - 
UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 20000, 1600); - ASSERT_NE(NULL, crpc1); - rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_NONBLOCKING, - self->client_id); + rpc = homa_wait_shared(&self->hsk, 1); + EXPECT_TRUE(IS_ERR(rpc)); EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); } -TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_sleeping) -{ - struct homa_rpc *rpc; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc1); - - /* Also, check to see that reaping occurs before sleeping. */ - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 20000, 20000); - self->homa.reap_limit = 5; - homa_rpc_free(crpc2); - EXPECT_EQ(30, self->hsk.dead_skbs); - unit_log_clear(); - - hook_rpc = crpc1; - unit_hook_register(handoff_hook); - rpc = homa_wait_for_message(&self->hsk, 0, self->client_id); - EXPECT_EQ(crpc1, rpc); - EXPECT_EQ(NULL, crpc1->interest); - EXPECT_STREQ("reaped 1236; wake_up_process pid 0; 0 in ready_requests, " - "0 in ready_responses, 0 in request_interests, " - "0 in response_interests", unit_log_get()); - EXPECT_EQ(0, self->hsk.dead_skbs); - homa_rpc_unlock(rpc); -} -TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_after_giving_up) +TEST_F(homa_incoming, homa_wait_shared__reap_when_nonblocking) { + struct homa_rpc *crpc; struct homa_rpc *rpc; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - hook_rpc = crpc; - unit_hook_register(handoff_hook2); - unit_log_clear(); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_NONBLOCKING|HOMA_RECVMSG_RESPONSE, 0); - ASSERT_EQ(crpc, rpc); - EXPECT_EQ(NULL, crpc->interest); - EXPECT_EQ(ETIMEDOUT, -rpc->error); - homa_rpc_unlock(rpc); -} -TEST_F(homa_incoming, homa_wait_for_message__handoff_rpc_then_delete_after_giving_up) -{ - // A key thing this test does it to ensure that RPC_HANDING_OFF - // gets cleared even though the RPC has been deleted. - struct homa_rpc *rpc; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 14 * 1400 + 1, 1600); ASSERT_NE(NULL, crpc); + homa_rpc_end(crpc); + EXPECT_EQ(15, self->hsk.dead_skbs); - // Prevent the RPC from being reaped during the test. 
- atomic_or(RPC_COPYING_TO_USER, &crpc->flags); - - hook_rpc = crpc; - hook3_count = 0; - unit_hook_register(handoff_hook3); - unit_log_clear(); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_NONBLOCKING|HOMA_RECVMSG_RESPONSE, 0); + rpc = homa_wait_shared(&self->hsk, 1); + EXPECT_TRUE(IS_ERR(rpc)); EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); - EXPECT_EQ(RPC_COPYING_TO_USER, atomic_read(&crpc->flags)); - EXPECT_EQ(RPC_DEAD, crpc->state); - atomic_andnot(RPC_COPYING_TO_USER, &crpc->flags); + EXPECT_EQ(5, self->hsk.dead_skbs); } -TEST_F(homa_incoming, homa_wait_for_message__explicit_rpc_deleted_while_sleeping) +TEST_F(homa_incoming, homa_wait_shared__signal_race_with_handoff) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - unit_log_clear(); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); + crpc->error = -ENOENT; + unit_hook_register(handoff_hook); hook_rpc = crpc; - unit_hook_register(delete_hook); - rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_RESPONSE, - self->client_id); - EXPECT_EQ(EINVAL, -PTR_ERR(rpc)); -} -TEST_F(homa_incoming, homa_wait_for_message__rpc_deleted_after_matching) -{ - /* Arrange for 2 RPCs to be ready, but delete the first one after - * it has matched; this should cause the second one to be matched. - */ - struct homa_rpc *rpc; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc1); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 20000, 1600); - ASSERT_NE(NULL, crpc2); - unit_log_clear(); + hook_count = 2; + mock_prepare_to_wait_errors = 1; - hook_rpc = crpc1; - unit_hook_register(match_delete_hook); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - EXPECT_EQ(RPC_DEAD, crpc1->state); - EXPECT_EQ(crpc2, rpc); + rpc = homa_wait_shared(&self->hsk, 0); + EXPECT_EQ(crpc, rpc); + EXPECT_EQ(ENOENT, -rpc->error); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block)); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_wait_for_message__socket_shutdown_while_sleeping) +TEST_F(homa_incoming, homa_wait_shared__socket_shutdown_while_blocked) { struct homa_rpc *rpc; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - hook_hsk = &self->hsk; - unit_hook_register(shutdown_hook); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_REQUEST, 0); + unit_hook_register(wait_hook4); + hook_shutdown_hsk = &self->hsk; + hook_count = 4; + + rpc = homa_wait_shared(&self->hsk, 0); + EXPECT_TRUE(IS_ERR(rpc)); EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); + EXPECT_EQ(1, self->hsk.shutdown); + self->hsk.shutdown = 0; + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); } -TEST_F(homa_incoming, homa_wait_for_message__copy_to_user) +TEST_F(homa_incoming, homa_wait_shared__copy_to_user_fails) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, 
self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - mock_copy_to_user_dont_copy = -1; - unit_log_clear(); - - hook_hsk = &self->hsk; - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); - EXPECT_EQ(0, atomic_read(&crpc->flags)); - EXPECT_EQ(1400, crpc->msgin.copied_out); -} -TEST_F(homa_incoming, homa_wait_for_message__copy_to_user_fails) -{ struct homa_rpc *rpc; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); - /* We don't set up a buffer pool, so copy_to_user will fail. */ - unit_log_clear(); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); + mock_copy_data_errors = 1; - hook_hsk = &self->hsk; - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - ASSERT_FALSE(IS_ERR(rpc)); + rpc = homa_wait_shared(&self->hsk, 0); EXPECT_EQ(crpc, rpc); - EXPECT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); - EXPECT_EQ(ENOMEM, -rpc->error); + EXPECT_EQ(EFAULT, -rpc->error); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_wait_for_message__message_complete) +TEST_F(homa_incoming, homa_wait_shared__rpc_has_error) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 2000); + self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - mock_copy_to_user_dont_copy = -1; - unit_log_clear(); + EXPECT_EQ(2, crpc->msgin.packets.qlen); + crpc->error = -ENOENT; - hook_hsk = &self->hsk; - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - ASSERT_FALSE(IS_ERR(rpc)); + rpc = homa_wait_shared(&self->hsk, 0); EXPECT_EQ(crpc, rpc); - EXPECT_EQ(0, atomic_read(&crpc->flags)); - EXPECT_EQ(2000, crpc->msgin.copied_out); + EXPECT_EQ(2, crpc->msgin.packets.qlen); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_wait_for_message__signal) -{ - struct homa_rpc *rpc; - - mock_signal_pending = 1; - rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_REQUEST, 0); - EXPECT_EQ(EINTR, -PTR_ERR(rpc)); -} - -TEST_F(homa_incoming, homa_rpc_handoff__handoff_already_in_progress) +TEST_F(homa_incoming, homa_wait_shared__rpc_dead) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); - unit_log_clear(); + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc); - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - crpc->interest = &interest; - atomic_or(RPC_HANDING_OFF, &crpc->flags); - homa_rpc_handoff(crpc); - crpc->interest = NULL; - EXPECT_EQ(NULL, (struct homa_rpc *) - 
atomic_long_read(&interest.ready_rpc)); - EXPECT_STREQ("", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + rpc = homa_wait_shared(&self->hsk, 0); + EXPECT_EQ(crpc2, rpc); + homa_rpc_put(rpc); + homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_rpc_handoff__rpc_already_enqueued) + +TEST_F(homa_incoming, homa_rpc_handoff__private_rpc) { struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); - unit_log_clear(); - /* First handoff enqueues the RPC. */ - homa_rpc_handoff(crpc); - EXPECT_FALSE(list_empty(&crpc->ready_links)); + ASSERT_NE(NULL, crpc); + set_bit(RPC_PRIVATE, &crpc->flags); + homa_interest_init_private(&interest, crpc); + mock_log_wakeups = 1; unit_log_clear(); - /* Second handoff does nothing, even though an interest is available. */ - - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - crpc->interest = &interest; - atomic_or(RPC_HANDING_OFF, &crpc->flags); homa_rpc_handoff(crpc); - crpc->interest = NULL; - EXPECT_EQ(NULL, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); - EXPECT_STREQ("", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + EXPECT_STREQ("wake_up", unit_log_get()); + EXPECT_EQ(1, atomic_read(&interest.ready)); + EXPECT_TRUE(list_empty(&self->hsk.ready_rpcs)); + homa_interest_unlink_private(&interest); } -TEST_F(homa_incoming, homa_rpc_handoff__interest_on_rpc) +TEST_F(homa_incoming, homa_rpc_handoff__socket_shutdown) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); unit_log_clear(); - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - crpc->interest = &interest; + self->hsk.shutdown = 1; homa_rpc_handoff(crpc); - crpc->interest = NULL; - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); - EXPECT_EQ(NULL, interest.reg_rpc); - EXPECT_EQ(NULL, crpc->interest); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + self->hsk.shutdown = 0; + EXPECT_TRUE(list_empty(&self->hsk.ready_rpcs)); } -TEST_F(homa_incoming, homa_rpc_handoff__response_interests) +TEST_F(homa_incoming, homa_rpc_handoff__handoff_to_shared_interest) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_interest interest1, interest2; + ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); + homa_interest_init_shared(&interest1, &self->hsk); + homa_interest_init_shared(&interest2, &self->hsk); + EXPECT_EQ(2, unit_list_length(&self->hsk.interests)); unit_log_clear(); - homa_interest_init(&interest); - interest.thread = &mock_task; - list_add_tail(&interest.response_links, &self->hsk.response_interests); homa_rpc_handoff(crpc); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); - EXPECT_EQ(0, unit_list_length(&self->hsk.response_interests)); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + EXPECT_EQ(1, unit_list_length(&self->hsk.interests)); + EXPECT_EQ(0, atomic_read(&interest1.ready)); 
+ EXPECT_EQ(1, atomic_read(&interest2.ready)); + EXPECT_EQ(crpc, interest2.rpc); + homa_rpc_put(crpc); + homa_interest_unlink_shared(&interest1); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->handoffs_thread_waiting)); } -TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_responses) +TEST_F(homa_incoming, homa_rpc_handoff__queue_rpc_on_socket) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); unit_log_clear(); + mock_log_wakeups = 1; + /* First call should queue RPC. */ homa_rpc_handoff(crpc); EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); -} -TEST_F(homa_incoming, homa_rpc_handoff__request_interests) -{ - struct homa_interest interest; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); - ASSERT_NE(NULL, srpc); - unit_log_clear(); - - homa_interest_init(&interest); - interest.thread = &mock_task; - list_add_tail(&interest.request_links, &self->hsk.request_interests); - homa_rpc_handoff(srpc); - EXPECT_EQ(srpc, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); - EXPECT_EQ(0, unit_list_length(&self->hsk.request_interests)); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &srpc->flags); -} -TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_requests) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - 1, 20000, 100); - ASSERT_NE(NULL, srpc); - unit_log_clear(); + EXPECT_FALSE(list_empty(&self->hsk.ready_rpcs)); - homa_rpc_handoff(srpc); - EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_requests)); -} -TEST_F(homa_incoming, homa_rpc_handoff__detach_interest) -{ - struct homa_interest interest; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); + /* Calling again should do nothing (already queued). 
*/ unit_log_clear(); - - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - crpc->interest = &interest; - list_add_tail(&interest.response_links, &self->hsk.response_interests); - list_add_tail(&interest.request_links, &self->hsk.request_interests); - EXPECT_EQ(1, unit_list_length(&self->hsk.response_interests)); - EXPECT_EQ(1, unit_list_length(&self->hsk.request_interests)); - homa_rpc_handoff(crpc); - crpc->interest = NULL; - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); - EXPECT_EQ(NULL, interest.reg_rpc); - EXPECT_EQ(NULL, crpc->interest); - EXPECT_EQ(0, unit_list_length(&self->hsk.response_interests)); - EXPECT_EQ(0, unit_list_length(&self->hsk.request_interests)); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + EXPECT_STREQ("", unit_log_get()); + EXPECT_FALSE(list_empty(&self->hsk.ready_rpcs)); } -TEST_F(homa_incoming, homa_incoming_sysctl_changed__grant_nonfifo) -{ - cpu_khz = 2000000; - self->homa.poll_usecs = 40; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(80000, self->homa.poll_cycles); -} -TEST_F(homa_incoming, homa_incoming_sysctl_changed__poll_cycles) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_incoming_sysctl_changed__convert_usec_to_cycles) { - self->homa.fifo_grant_increment = 10000; - self->homa.grant_fifo_fraction = 0; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(0, self->homa.grant_nonfifo); - - self->homa.grant_fifo_fraction = 100; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(90000, self->homa.grant_nonfifo); - - self->homa.grant_fifo_fraction = 500; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(10000, self->homa.grant_nonfifo); - - self->homa.grant_fifo_fraction = 2000; + self->homa.poll_usecs = 27; + self->homa.busy_usecs = 53; + self->homa.gro_busy_usecs = 140; + self->homa.bpage_lease_usecs = 700; homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(10000, self->homa.grant_nonfifo); + EXPECT_EQ(27000, self->homa.poll_cycles); + EXPECT_EQ(53000, self->homa.busy_cycles); + EXPECT_EQ(140000, self->homa.gro_busy_cycles); + EXPECT_EQ(700000, self->homa.bpage_lease_cycles); } +#endif /* See strip.py */ diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c new file mode 100644 index 00000000..7a2ab0d1 --- /dev/null +++ b/test/unit_homa_interest.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +#include "homa_impl.h" +#include "homa_interest.h" +#include "homa_sock.h" + +#ifndef __STRIP__ /* See strip.py */ +#include "homa_offload.h" +#endif /* See strip.py */ + +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +static int hook_count; +static struct homa_interest *hook_interest; + +#ifndef __STRIP__ /* See strip.py */ +static void log_hook(char *id) +{ + if (strcmp(id, "unlock") == 0 || + strcmp(id, "schedule") == 0) { + unit_log_printf("; ", "%s", id); + } +} +#endif /* See strip.py */ + +static void notify_hook(char *id) +{ + if (strcmp(id, "schedule") != 0 && + strcmp(id, "do_wait_intr_irq") != 0 && + strcmp(id, "prepare_to_wait") != 0) + return; + if (hook_count <= 0) + return; + hook_count--; + if (hook_count != 0) + return; + atomic_set(&hook_interest->ready, 1); +} + +FIXTURE(homa_interest) { + struct homa homa; + struct homa_net *hnet; + struct homa_sock hsk; + struct in6_addr client_ip; + int client_port; + struct in6_addr server_ip; + int server_port; + u64 client_id; + u64 server_id; + union 
sockaddr_in_union server_addr; +}; +FIXTURE_SETUP(homa_interest) +{ + homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); + self->client_ip = unit_get_in_addr("196.168.0.1"); + self->client_port = 40000; + self->server_ip = unit_get_in_addr("1.2.3.4"); + self->server_port = 99; + self->client_id = 1234; + self->server_id = 1235; + self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; + self->server_addr.in6.sin6_addr = self->server_ip; + self->server_addr.in6.sin6_port = htons(self->server_port); + unit_log_clear(); +} +FIXTURE_TEARDOWN(homa_interest) +{ + homa_destroy(&self->homa); + unit_teardown(); +} + +TEST_F(homa_interest, homa_interest_init_shared_and_unlink_shared) +{ + struct homa_interest interests[4]; + int i; + + for (i = 0; i < 4; i++) { + homa_interest_init_shared(&interests[i], &self->hsk); + EXPECT_EQ(i + 1, unit_list_length(&self->hsk.interests)); + } + EXPECT_EQ(3, list_first_entry(&self->hsk.interests, + struct homa_interest, links) + - interests); + homa_interest_unlink_shared(&interests[1]); + EXPECT_EQ(3, unit_list_length(&self->hsk.interests)); + homa_interest_unlink_shared(&interests[0]); + EXPECT_EQ(2, unit_list_length(&self->hsk.interests)); + homa_interest_unlink_shared(&interests[3]); + EXPECT_EQ(1, unit_list_length(&self->hsk.interests)); + homa_interest_unlink_shared(&interests[2]); + EXPECT_EQ(0, unit_list_length(&self->hsk.interests)); +} + +TEST_F(homa_interest, homa_interest_init_private) +{ + struct homa_interest interest; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->server_port, + self->client_id, 20000, 1600); + + /* First call succeeds. */ + EXPECT_EQ(0, homa_interest_init_private(&interest, crpc)); + EXPECT_EQ(&interest, crpc->private_interest); + EXPECT_EQ(crpc, interest.rpc); + + /* Second call fails (rpc already has interest). */ + EXPECT_EQ(EINVAL, -homa_interest_init_private(&interest, crpc)); + + homa_interest_unlink_private(&interest); +} + +TEST_F(homa_interest, homa_interest_unlink_private) +{ + struct homa_interest interest, interest2; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->server_port, + self->client_id, 20000, 1600); + + EXPECT_EQ(0, homa_interest_init_private(&interest, crpc)); + homa_interest_unlink_private(&interest); + EXPECT_EQ(NULL, crpc->private_interest); + + /* Second call does nothing (rpc doesn't refer to interest). 
*/ + crpc->private_interest = &interest2; + homa_interest_unlink_private(&interest); + EXPECT_EQ(&interest2, crpc->private_interest); + + crpc->private_interest = NULL; +} + +TEST_F(homa_interest, homa_interest_wait__already_ready) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + atomic_set(&interest.ready, 1); + EXPECT_EQ(0, homa_interest_wait(&interest)); + EXPECT_EQ(0, interest.blocked); + + homa_interest_unlink_shared(&interest); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_interest, homa_interest_wait__call_schedule) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + + self->homa.poll_cycles = 100; + unit_hook_register(log_hook); + unit_hook_register(notify_hook); + hook_interest = &interest; + hook_count = 2; + unit_log_clear(); + + EXPECT_EQ(0, homa_interest_wait(&interest)); + EXPECT_STREQ("schedule; schedule", unit_log_get()); + homa_interest_unlink_shared(&interest); +} +#endif /* See strip.py */ +TEST_F(homa_interest, homa_interest_wait__call_homa_rpc_reap) +{ + struct homa_interest interest; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->server_port, + self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); + homa_rpc_end(crpc); + EXPECT_EQ(15, self->hsk.dead_skbs); + + homa_interest_init_shared(&interest, &self->hsk); + + IF_NO_STRIP(self->homa.poll_cycles = 0); + unit_hook_register(notify_hook); + hook_interest = &interest; + hook_count = 1; + unit_log_clear(); + + EXPECT_EQ(0, homa_interest_wait(&interest)); + EXPECT_EQ(5, self->hsk.dead_skbs); + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__poll_then_block) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + IF_NO_STRIP(self->homa.poll_cycles = 3000); + mock_set_clock_vals(1000, 2000, 3999, 4000, 0); + mock_clock = 4000; + unit_hook_register(notify_hook); + hook_interest = &interest; + hook_count = 4; + + EXPECT_EQ(0, -homa_interest_wait(&interest)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(3000, homa_metrics_per_cpu()->poll_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->blocked_cycles); + EXPECT_EQ(1, interest.blocked); +#endif /* See strip.py */ + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__interrupted_by_signal) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + mock_prepare_to_wait_errors = 1; + IF_NO_STRIP(self->homa.poll_cycles = 0); + + EXPECT_EQ(EINTR, -homa_interest_wait(&interest)); + EXPECT_EQ(1, interest.blocked); + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__time_metrics) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + IF_NO_STRIP(self->homa.poll_cycles = 0); + mock_set_clock_vals(1000, 1500, 3000, 3200, 0); + mock_clock = 4000; + unit_hook_register(notify_hook); + hook_interest = &interest; + hook_count = 4; + + EXPECT_EQ(0, -homa_interest_wait(&interest)); + IF_NO_STRIP(EXPECT_EQ(700, homa_metrics_per_cpu()->poll_cycles)); + IF_NO_STRIP(EXPECT_EQ(1500, homa_metrics_per_cpu()->blocked_cycles)); + homa_interest_unlink_shared(&interest); +} + +TEST_F(homa_interest, homa_interest_wait__notify_private) +{ + struct homa_interest interest; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->server_port, + self->client_id, 
20000, 1600); + ASSERT_NE(NULL, crpc); + + homa_interest_init_private(&interest, crpc); + EXPECT_EQ(0, atomic_read(&interest.ready)); + unit_log_clear(); + mock_log_wakeups = 1; + + /* First call: RPC has an interest. */ + homa_interest_notify_private(crpc); + EXPECT_EQ(1, atomic_read(&interest.ready)); + EXPECT_STREQ("wake_up", unit_log_get()); + homa_interest_unlink_private(&interest); + + /* Second call: No interest on RPC. */ + unit_log_clear(); + homa_interest_notify_private(crpc); + EXPECT_STREQ("", unit_log_get()); +} + +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_interest, homa_choose_interest__find_idle_core) +{ + struct homa_interest interest1, interest2, interest3; + + homa_interest_init_shared(&interest1, &self->hsk); + interest1.core = 1; + homa_interest_init_shared(&interest2, &self->hsk); + interest2.core = 2; + homa_interest_init_shared(&interest3, &self->hsk); + interest3.core = 3; + + mock_clock = 5000; + self->homa.busy_cycles = 1000; + per_cpu(homa_offload_core, 1).last_active = 2000; + per_cpu(homa_offload_core, 2).last_active = 3500; + per_cpu(homa_offload_core, 3).last_active = 4100; + + struct homa_interest *result = homa_choose_interest(&self->hsk); + EXPECT_EQ(&interest2, result); + EXPECT_EQ(2, result->core); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->handoffs_alt_thread)); + INIT_LIST_HEAD(&self->hsk.interests); +} +TEST_F(homa_interest, homa_choose_interest__all_cores_busy) +{ + struct homa_interest interest1, interest2, interest3; + + homa_interest_init_shared(&interest1, &self->hsk); + interest1.core = 1; + homa_interest_init_shared(&interest2, &self->hsk); + interest2.core = 2; + homa_interest_init_shared(&interest3, &self->hsk); + interest3.core = 3; + + mock_clock = 5000; + self->homa.busy_cycles = 1000; + per_cpu(homa_offload_core, 1).last_active = 4100; + per_cpu(homa_offload_core, 2).last_active = 4001; + per_cpu(homa_offload_core, 3).last_active = 4800; + + struct homa_interest *result = homa_choose_interest(&self->hsk); + EXPECT_EQ(3, result->core); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->handoffs_alt_thread)); + INIT_LIST_HEAD(&self->hsk.interests); +} +#endif /* See strip.py */ diff --git a/test/unit_homa_lcache.c b/test/unit_homa_lcache.c deleted file mode 100644 index d433e090..00000000 --- a/test/unit_homa_lcache.c +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#include "homa_impl.h" -#include "homa_lcache.h" -#define KSELFTEST_NOT_MAIN 1 -#include "kselftest_harness.h" -#include "ccutils.h" -#include "mock.h" -#include "utils.h" - -FIXTURE(homa_lcache) { - struct homa_lcache cache; - struct homa homa; - struct homa_sock hsk; - struct homa_rpc *crpc; - struct homa_rpc *srpc; - struct in6_addr client_ip[2]; - struct in6_addr server_ip[1]; -}; -FIXTURE_SETUP(homa_lcache) -{ - homa_lcache_init(&self->cache); - homa_init(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); - self->server_ip[0] = unit_get_in_addr("1.2.3.4"); - self->client_ip[0] = unit_get_in_addr("196.168.0.1"); - self->client_ip[0] = unit_get_in_addr("196.168.0.2"); - self->crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, - self->server_ip, - 99, 1234, 1000, 1000); - self->srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, - self->server_ip, - 40000, 1235, 1000, 1000); -} -FIXTURE_TEARDOWN(homa_lcache) -{ - homa_destroy(&self->homa); - unit_teardown(); -} - -TEST_F(homa_lcache, constructor) -{ - EXPECT_TRUE(self->cache.rpc == NULL); -} - -TEST_F(homa_lcache, homa_lcache_save__empty) -{ - homa_lcache_save(&self->cache, self->crpc); - EXPECT_EQ(self->crpc, self->cache.rpc); -} -TEST_F(homa_lcache, homa_lcache_save__full) -{ - homa_rpc_lock(self->crpc); - homa_lcache_save(&self->cache, self->crpc); - homa_rpc_lock(self->srpc); - homa_lcache_save(&self->cache, self->srpc); - EXPECT_EQ(self->srpc, self->cache.rpc); - homa_lcache_release(&self->cache); -} - -TEST_F(homa_lcache, homa_lcache_release) -{ - homa_lcache_release(&self->cache); - homa_rpc_lock(self->crpc); - homa_lcache_save(&self->cache, self->crpc); - homa_lcache_release(&self->cache); - EXPECT_TRUE(self->cache.rpc == NULL); -} - -TEST_F(homa_lcache, homa_lcache_get) -{ - EXPECT_TRUE(homa_lcache_get(&self->cache, 1235, self->client_ip, - 40000) == NULL); - homa_lcache_save(&self->cache, self->srpc); - EXPECT_EQ(self->srpc, homa_lcache_get(&self->cache, 1235, - self->client_ip, 40000)); - EXPECT_TRUE(homa_lcache_get(&self->cache, 1237, self->client_ip, - 40000) == NULL); - EXPECT_TRUE(homa_lcache_get(&self->cache, 1235, self->client_ip+1, - 40000) == NULL); - EXPECT_TRUE(homa_lcache_get(&self->cache, 1235, self->client_ip, - 40001) == NULL); -} diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c new file mode 100644 index 00000000..c048759a --- /dev/null +++ b/test/unit_homa_metrics.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +#include "homa_impl.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +FIXTURE(homa_metrics) { + struct homa homa; +}; +FIXTURE_SETUP(homa_metrics) +{ + homa_init(&self->homa); +} +FIXTURE_TEARDOWN(homa_metrics) +{ + homa_destroy(&self->homa); + homa_metrics_end(); + unit_teardown(); +} + +TEST_F(homa_metrics, homa_metric_append) +{ + homa_mout.length = 0; + homa_metric_append("metric1", 12345, "Description 1\n"); + EXPECT_EQ(200, homa_mout.capacity); + EXPECT_EQ(66, homa_mout.length); + EXPECT_STREQ("metric1 12345 Description 1\n", + homa_mout.output); + + homa_metric_append("value with long name", 8, "Value %d, value 2 %08d\n", + 16, 44); + EXPECT_EQ(400, homa_mout.capacity); + EXPECT_EQ(145, homa_mout.length); + EXPECT_STREQ("metric1 12345 Description 1\n" + "value with long name 8 Value 16, value 2 00000044\n", + homa_mout.output); +} + +TEST_F(homa_metrics, homa_metrics_open) +{ + EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); + 
EXPECT_NE(NULL, homa_mout.output); + + strcpy(homa_mout.output, "12345"); + EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); + EXPECT_EQ(5, strlen(homa_mout.output)); + EXPECT_EQ(2, homa_mout.active_opens); +} +TEST_F(homa_metrics, homa_metrics_read__basics) +{ + loff_t offset = 10; + char buffer[1000]; + + homa_mout.output = kmalloc(100, GFP_KERNEL); + homa_mout.capacity = 100; + strcpy(homa_mout.output, "0123456789abcdefghijklmnop"); + homa_mout.length = 26; + EXPECT_EQ(5, homa_metrics_read(NULL, buffer, 5, &offset)); + EXPECT_SUBSTR("_copy_to_user copied 5 bytes", unit_log_get()); + EXPECT_EQ(15, offset); + + unit_log_clear(); + EXPECT_EQ(11, homa_metrics_read(NULL, buffer, 1000, &offset)); + EXPECT_SUBSTR("_copy_to_user copied 11 bytes", unit_log_get()); + EXPECT_EQ(26, offset); + + unit_log_clear(); + EXPECT_EQ(0, homa_metrics_read(NULL, buffer, 1000, &offset)); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(26, offset); +} +TEST_F(homa_metrics, homa_metrics_read__error_copying_to_user) +{ + loff_t offset = 10; + char buffer[1000]; + + homa_mout.output = kmalloc(100, GFP_KERNEL); + homa_mout.capacity = 100; + strcpy(homa_mout.output, "0123456789abcdefghijklmnop"); + homa_mout.length = 26; + mock_copy_to_user_errors = 1; + EXPECT_EQ(EFAULT, -homa_metrics_read(NULL, buffer, 5, &offset)); +} + +TEST_F(homa_metrics, homa_metrics_release) +{ + homa_mout.active_opens = 2; + EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); + EXPECT_EQ(1, homa_mout.active_opens); + + EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); + EXPECT_EQ(0, homa_mout.active_opens); +} diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 40526f05..c64dd7a0 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -1,99 +1,215 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" -#include "homa_lcache.h" +#include "homa_offload.h" +#include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" #include "mock.h" #include "utils.h" -extern struct homa *homa; +#define cur_offload_core (&per_cpu(homa_offload_core, smp_processor_id())) -FIXTURE(homa_offload) { +static struct sk_buff *test_tcp_gro_receive(struct list_head *held_list, + struct sk_buff *skb) +{ + UNIT_LOG("; ", "test_tcp_gro_receive"); + return NULL; +} +static struct sk_buff *unit_tcp6_gro_receive(struct list_head *held_list, + struct sk_buff *skb) +{ + UNIT_LOG("; ", "unit_tcp6_gro_receive"); + return NULL; +} + +FIXTURE(homa_offload) +{ struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; struct in6_addr ip; - struct data_header header; + struct homa_data_hdr header; struct napi_struct napi; struct sk_buff *skb, *skb2; struct list_head empty_list; + struct net_offload tcp_offloads; + struct net_offload tcp6_offloads; }; FIXTURE_SETUP(homa_offload) { int i; + homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; - self->homa.grant_threshold = self->homa.rtt_bytes; - homa = &self->homa; - mock_sock_init(&self->hsk, &self->homa, 99); + self->homa.unsched_bytes = 10000; + mock_sock_init(&self->hsk, self->hnet, 99); self->ip = unit_get_in_addr("196.168.0.1"); - self->header = (struct data_header){.common = { - .sport = htons(40000), .dport = htons(99), - .type = DATA, - .sender_id = cpu_to_be64(1000)}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .retransmit = 0, - .seg = {.offset = htonl(2000), - .segment_length = htonl(1400), - .ack = {0, 0, 0}}}; + memset(&self->header, 0, sizeof(self->header)); + self->header.common = (struct homa_common_hdr){ + .sport = htons(40000), .dport = htons(99), + .type = DATA, + .flags = HOMA_TCP_FLAGS, + .urgent = HOMA_TCP_URGENT, + .sender_id = cpu_to_be64(1000) + }; + self->header.message_length = htonl(10000); + self->header.incoming = htonl(10000); + self->header.seg.offset = htonl(2000); for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&self->napi.gro_hash[i].list); - self->napi.gro_hash[i].count = 0; + INIT_LIST_HEAD(&self->napi.gro.hash[i].list); + self->napi.gro.hash[i].count = 0; } - self->napi.gro_bitmask = 0; + self->napi.gro.bitmask = 0; - self->skb = mock_skb_new(&self->ip, &self->header.common, 1400, 2000); + self->skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 2000); NAPI_GRO_CB(self->skb)->same_flow = 0; NAPI_GRO_CB(self->skb)->last = self->skb; NAPI_GRO_CB(self->skb)->count = 1; - self->header.seg.offset = htonl(4000); - self->header.common.dport = htons(88); - self->header.common.sender_id = cpu_to_be64(1002); - self->skb2 = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + self->header.seg.offset = htonl(4000); + self->header.common.dport = htons(88); + self->header.common.sender_id = cpu_to_be64(1002); + self->skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(self->skb2)->same_flow = 0; NAPI_GRO_CB(self->skb2)->last = self->skb2; NAPI_GRO_CB(self->skb2)->count = 1; - self->napi.gro_bitmask = 6; - self->napi.gro_hash[2].count = 2; - list_add_tail(&self->skb->list, &self->napi.gro_hash[2].list); - list_add_tail(&self->skb2->list, &self->napi.gro_hash[2].list); + self->napi.gro.bitmask = 6; + self->napi.gro.hash[2].count = 2; + list_add_tail(&self->skb->list, 
&self->napi.gro.hash[2].list); + list_add_tail(&self->skb2->list, &self->napi.gro.hash[2].list); INIT_LIST_HEAD(&self->empty_list); + self->tcp_offloads.callbacks.gro_receive = test_tcp_gro_receive; + inet_offloads[IPPROTO_TCP] = &self->tcp_offloads; + self->tcp6_offloads.callbacks.gro_receive = unit_tcp6_gro_receive; + inet6_offloads[IPPROTO_TCP] = &self->tcp6_offloads; + homa_offload_init(); + unit_log_clear(); + + /* Configure so core isn't considered too busy for bypasses. */ + mock_clock = 1000; + self->homa.gro_busy_cycles = 500; + cur_offload_core->last_gro = 400; } FIXTURE_TEARDOWN(homa_offload) { - struct sk_buff *skb, *tmp; + struct sk_buff *skb, *tmp; - list_for_each_entry_safe(skb, tmp, &self->napi.gro_hash[2].list, list) + homa_offload_end(); + list_for_each_entry_safe(skb, tmp, &self->napi.gro.hash[2].list, list) kfree_skb(skb); homa_destroy(&self->homa); - homa = NULL; unit_teardown(); } +TEST_F(homa_offload, homa_gro_hook_tcp) +{ + homa_gro_hook_tcp(); + EXPECT_EQ(&homa_tcp_gro_receive, + inet_offloads[IPPROTO_TCP]->callbacks.gro_receive); + EXPECT_EQ(&homa_tcp_gro_receive, + inet6_offloads[IPPROTO_TCP]->callbacks.gro_receive); + + /* Second hook call should do nothing. */ + homa_gro_hook_tcp(); + + homa_gro_unhook_tcp(); + EXPECT_EQ(&test_tcp_gro_receive, + inet_offloads[IPPROTO_TCP]->callbacks.gro_receive); + EXPECT_EQ(&unit_tcp6_gro_receive, + inet6_offloads[IPPROTO_TCP]->callbacks.gro_receive); + + /* Second unhook call should do nothing. */ + homa_gro_unhook_tcp(); + EXPECT_EQ(&test_tcp_gro_receive, + inet_offloads[IPPROTO_TCP]->callbacks.gro_receive); + EXPECT_EQ(&unit_tcp6_gro_receive, + inet6_offloads[IPPROTO_TCP]->callbacks.gro_receive); +} + +TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) +{ + struct homa_common_hdr *h; + struct sk_buff *skb; + + homa_gro_hook_tcp(); + self->header.seg.offset = htonl(6000); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + h = (struct homa_common_hdr *) skb_transport_header(skb); + h->flags = 0; + EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); + EXPECT_STREQ("test_tcp_gro_receive", unit_log_get()); + kfree_skb(skb); + unit_log_clear(); + + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + h = (struct homa_common_hdr *)skb_transport_header(skb); + h->urgent -= 1; + EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); + EXPECT_STREQ("test_tcp_gro_receive", unit_log_get()); + kfree_skb(skb); + homa_gro_unhook_tcp(); +} +TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) +{ + struct homa_common_hdr *h; + struct sk_buff *skb; + + mock_ipv6 = true; + homa_gro_hook_tcp(); + self->header.seg.offset = htonl(6000); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + ipv6_hdr(skb)->nexthdr = IPPROTO_TCP; + h = (struct homa_common_hdr *)skb_transport_header(skb); + h->flags = HOMA_TCP_FLAGS; + h->urgent = htons(HOMA_TCP_URGENT); + NAPI_GRO_CB(skb)->same_flow = 0; + cur_offload_core->held_skb = NULL; + cur_offload_core->held_bucket = 99; + EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(IPPROTO_HOMA, ipv6_hdr(skb)->nexthdr); + kfree_skb(skb); + homa_gro_unhook_tcp(); +} +TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) +{ + struct homa_common_hdr *h; + struct sk_buff *skb; + + mock_ipv6 = false; + homa_gro_hook_tcp(); + self->header.seg.offset = htonl(6000); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + 
ip_hdr(skb)->protocol = IPPROTO_TCP; + h = (struct homa_common_hdr *)skb_transport_header(skb); + h->flags = HOMA_TCP_FLAGS; + h->urgent = htons(HOMA_TCP_URGENT); + NAPI_GRO_CB(skb)->same_flow = 0; + cur_offload_core->held_skb = NULL; + cur_offload_core->held_bucket = 99; + EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(IPPROTO_HOMA, ip_hdr(skb)->protocol); + EXPECT_EQ(29695, ip_hdr(skb)->check); + kfree_skb(skb); + homa_gro_unhook_tcp(); +} + TEST_F(homa_offload, homa_gso_segment_set_ip_ids) { + struct sk_buff *skb, *segs; + int version; + mock_ipv6 = false; - struct sk_buff *skb = mock_skb_new(&self->ip, &self->header.common, - 1400, 2000); - int version = ip_hdr(skb)->version; + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 2000); + version = ip_hdr(skb)->version; EXPECT_EQ(4, version); - struct sk_buff *segs = homa_gso_segment(skb, 0); + segs = homa_gso_segment(skb, 0); ASSERT_NE(NULL, segs); ASSERT_NE(NULL, segs->next); EXPECT_EQ(NULL, segs->next->next); @@ -104,156 +220,255 @@ TEST_F(homa_offload, homa_gso_segment_set_ip_ids) kfree_skb(segs); } -TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) +TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) { - struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); - struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); - int client_port = 40000; - __u64 client_id = 1234; - __u64 server_id = 1235; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - &client_ip, &server_ip, client_port, server_id, 100, - 20000); - ASSERT_NE(NULL, srpc); - homa_xmit_data(srpc, false); - unit_log_clear(); + struct sk_buff *skb, *skb2; + struct homa_data_hdr *h; - struct grant_header h = {{.sport = htons(srpc->dport), - .dport = htons(self->hsk.port), - .sender_id = cpu_to_be64(client_id), - .type = GRANT}, - .offset = htonl(12600), - .priority = 3}; - self->homa.gro_policy = HOMA_GRO_FAST_GRANTS; - struct sk_buff *result = homa_gro_receive(&self->empty_list, - mock_skb_new(&client_ip, &h.common, 0, 0)); - EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); - EXPECT_EQ(12600, srpc->msgout.granted); - EXPECT_STREQ("xmit DATA 1400@11200", unit_log_get()); + /* First call: copy offset from sequence number. */ + self->header.common.sequence = htonl(6000); + self->header.seg.offset = -1; + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + NAPI_GRO_CB(skb)->same_flow = 0; + cur_offload_core->held_skb = NULL; + cur_offload_core->held_bucket = 99; + EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); + h = (struct homa_data_hdr *) skb_transport_header(skb); + EXPECT_EQ(6000, htonl(h->seg.offset)); + + /* Second call: offset already valid. 
*/ + self->header.common.sequence = htonl(6000); + self->header.seg.offset = ntohl(5000); + skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + NAPI_GRO_CB(skb2)->same_flow = 0; + EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb2)); + h = (struct homa_data_hdr *)skb_transport_header(skb2); + EXPECT_EQ(5000, htonl(h->seg.offset)); - unit_log_clear(); - h.offset = htonl(14000); - self->homa.gro_policy = 0; - struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0); - result = homa_gro_receive(&self->empty_list, skb); - EXPECT_EQ(NULL, result); - EXPECT_EQ(12600, srpc->msgout.granted); - EXPECT_STREQ("", unit_log_get()); kfree_skb(skb); + kfree_skb(skb2); } TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) { struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); + struct sk_buff *skb, *skb2, *skb3, *skb4, *result; int client_port = 40000; + u64 client_id = 1234; + u64 server_id = 1235; + struct homa_rpc *srpc; int server_port = 99; - __u64 client_id = 1234; - __u64 server_id = 1235; - struct data_header h = {.common = { - .sport = htons(40000), .dport = htons(server_port), - .type = DATA, - .sender_id = cpu_to_be64(client_id)}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .retransmit = 0, - .seg = {.offset = htonl(2000), - .segment_length = htonl(1400), - .ack = {0, 0, 0}}}; - struct sk_buff *skb, *skb2, *skb3; - - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + struct homa_data_hdr h; + + memset(&h, 0, sizeof(h)); + h.common.sport = htons(40000); + h.common.dport = htons(server_port); + h.common.type = DATA; + h.common.sender_id = cpu_to_be64(client_id); + h.message_length = htonl(10000); + h.incoming = htonl(10000); + h.seg.offset = htonl(2000); + + srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &client_ip, &server_ip, client_port, server_id, 10000, 200); ASSERT_NE(NULL, srpc); unit_log_clear(); /* First attempt: HOMA_GRO_SHORT_BYPASS not enabled. */ - skb = mock_skb_new(&self->ip, &h.common, 1400, 2000); - struct sk_buff *result = homa_gro_receive(&self->empty_list, skb); + skb = mock_skb_alloc(&self->ip, &h.common, 1400, 2000); + result = homa_gro_receive(&self->empty_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(8600, srpc->msgin.bytes_remaining); + EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); - /* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but packet too long. */ + /* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but message longer + * than one packet. + */ self->homa.gro_policy |= HOMA_GRO_SHORT_BYPASS; - skb2 = mock_skb_new(&self->ip, &h.common, 1400, 3000); + cur_offload_core->last_gro = 400; + skb2 = mock_skb_alloc(&self->ip, &h.common, 1400, 2000); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(8600, srpc->msgin.bytes_remaining); + EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); /* Third attempt: bypass should happen. */ - h.seg.segment_length = htonl(100); - skb3 = mock_skb_new(&self->ip, &h.common, 100, 4000); + h.message_length = htonl(1400); + h.incoming = htonl(1400); + cur_offload_core->last_gro = 400; + skb3 = mock_skb_alloc(&self->ip, &h.common, 1400, 4000); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); - EXPECT_EQ(8500, srpc->msgin.bytes_remaining); + EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses); + + /* Fourth attempt: no bypass because core busy. 
*/ + cur_offload_core->last_gro = 600; + skb4 = mock_skb_alloc(&self->ip, &h.common, 1400, 4000); + result = homa_gro_receive(&self->empty_list, skb4); + EXPECT_EQ(0, -PTR_ERR(result)); + EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses); kfree_skb(skb); kfree_skb(skb2); + kfree_skb(skb4); +} +TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) +{ + struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); + struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); + struct sk_buff *skb, *skb2, *skb3, *result; + struct homa_grant_hdr h; + int client_port = 40000; + u64 client_id = 1234; + u64 server_id = 1235; + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, + &client_ip, &server_ip, client_port, server_id, 100, + 20000); + ASSERT_NE(NULL, srpc); + homa_rpc_lock(srpc); + homa_xmit_data(srpc, false); + homa_rpc_unlock(srpc); + unit_log_clear(); + + h.common.sport = htons(srpc->dport); + h.common.dport = htons(self->hsk.port); + h.common.sender_id = cpu_to_be64(client_id); + h.common.type = GRANT; + h.offset = htonl(11000); + h.priority = 3; + + /* First attempt: HOMA_GRO_FAST_GRANTS not enabled. */ + self->homa.gro_policy = 0; + skb = mock_skb_alloc(&client_ip, &h.common, 0, 0); + result = homa_gro_receive(&self->empty_list, skb); + EXPECT_EQ(0, -PTR_ERR(result)); + EXPECT_EQ(0, homa_metrics_per_cpu()->gro_grant_bypasses); + EXPECT_STREQ("", unit_log_get()); + + /* Second attempt: HOMA_GRO_FAST_GRANTS is enabled. */ + self->homa.gro_policy = HOMA_GRO_FAST_GRANTS; + cur_offload_core->last_gro = 400; + skb2 = mock_skb_alloc(&client_ip, &h.common, 0, 0); + result = homa_gro_receive(&self->empty_list, skb2); + EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); + EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses); + EXPECT_SUBSTR("xmit DATA 1400@10000", unit_log_get()); + + /* Third attempt: core is too busy for fast grants. 
*/ + cur_offload_core->last_gro = 600; + skb3 = mock_skb_alloc(&client_ip, &h.common, 0, 0); + result = homa_gro_receive(&self->empty_list, skb3); + EXPECT_EQ(0, -PTR_ERR(result)); + EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses); + kfree_skb(skb); + kfree_skb(skb3); } TEST_F(homa_offload, homa_gro_receive__no_held_skb) { struct sk_buff *skb; int same_flow; + self->header.seg.offset = htonl(6000); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + skb->hash = 2; NAPI_GRO_CB(skb)->same_flow = 0; - homa_cores[cpu_number]->held_skb = NULL; - homa_cores[cpu_number]->held_bucket = 99; - EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); + cur_offload_core->held_skb = NULL; + cur_offload_core->held_bucket = 2; + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[2].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb); - EXPECT_EQ(3, homa_cores[cpu_number]->held_bucket); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_EQ(2, cur_offload_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__empty_merge_list) { struct sk_buff *skb; int same_flow; + self->header.seg.offset = htonl(6000); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + skb->hash = 2; NAPI_GRO_CB(skb)->same_flow = 0; - homa_cores[cpu_number]->held_skb = skb; - homa_cores[cpu_number]->held_bucket = 3; - EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); + cur_offload_core->held_skb = self->skb; + cur_offload_core->held_bucket = 3; + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[2].list, skb)); + same_flow = NAPI_GRO_CB(skb)->same_flow; + EXPECT_EQ(0, same_flow); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_EQ(2, cur_offload_core->held_bucket); + kfree_skb(skb); +} +TEST_F(homa_offload, homa_gro_receive__held_skb_not_in_merge_list) +{ + struct sk_buff *skb; + int same_flow; + + self->header.seg.offset = htonl(6000); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + skb->hash = 3; + NAPI_GRO_CB(skb)->same_flow = 0; + cur_offload_core->held_skb = skb; + cur_offload_core->held_bucket = 2; + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); + same_flow = NAPI_GRO_CB(skb)->same_flow; + EXPECT_EQ(0, same_flow); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_EQ(3, cur_offload_core->held_bucket); + kfree_skb(skb); +} +TEST_F(homa_offload, homa_gro_receive__held_skb__in_merge_list_but_wrong_proto) +{ + struct sk_buff *skb; + int same_flow; + + self->header.seg.offset = htonl(6000); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + skb->hash = 3; + NAPI_GRO_CB(skb)->same_flow = 0; + cur_offload_core->held_skb = self->skb; + if (skb_is_ipv6(self->skb)) + ipv6_hdr(self->skb)->nexthdr = IPPROTO_TCP; + else + ip_hdr(self->skb)->protocol = IPPROTO_TCP; + cur_offload_core->held_bucket = 2; + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb); - EXPECT_EQ(3, homa_cores[cpu_number]->held_bucket); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_EQ(3, cur_offload_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__merge) { struct sk_buff *skb, *skb2; int same_flow; - homa_cores[cpu_number]->held_skb = self->skb2; - 
homa_cores[cpu_number]->held_bucket = 2; + + cur_offload_core->held_skb = self->skb2; + cur_offload_core->held_bucket = 2; self->header.seg.offset = htonl(6000); self->header.common.sender_id = cpu_to_be64(1002); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb)->same_flow = 0; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(1, same_flow); EXPECT_EQ(2, NAPI_GRO_CB(self->skb2)->count); self->header.seg.offset = htonl(7000); self->header.common.sender_id = cpu_to_be64(1004); - skb2 = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb2)->same_flow = 0; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb2)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb2)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(1, same_flow); EXPECT_EQ(3, NAPI_GRO_CB(self->skb2)->count); unit_log_frag_list(self->skb2, 1); - EXPECT_STREQ("DATA from 196.168.0.1:40000, dport 88, id 1002, " - "message_length 10000, offset 6000, " - "data_length 1400, incoming 10000; " - "DATA from 196.168.0.1:40000, dport 88, id 1004, " - "message_length 10000, offset 7000, " - "data_length 1400, incoming 10000", + EXPECT_STREQ("DATA from 196.168.0.1:40000, dport 88, id 1002, message_length 10000, offset 6000, data_length 1400, incoming 10000; " + "DATA from 196.168.0.1:40000, dport 88, id 1004, message_length 10000, offset 7000, data_length 1400, incoming 10000", unit_log_get()); } TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) @@ -261,101 +476,169 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) struct sk_buff *skb; // First packet: fits below the limit. - homa->max_gro_skbs = 3; - homa_cores[cpu_number]->held_skb = self->skb2; - homa_cores[cpu_number]->held_bucket = 2; + self->homa.max_gro_skbs = 3; + cur_offload_core->held_skb = self->skb2; + cur_offload_core->held_bucket = 2; self->header.seg.offset = htonl(6000); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); - homa_gro_receive(&self->napi.gro_hash[3].list, skb); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + homa_gro_receive(&self->napi.gro.hash[3].list, skb); EXPECT_EQ(2, NAPI_GRO_CB(self->skb2)->count); - EXPECT_EQ(2, self->napi.gro_hash[2].count); + EXPECT_EQ(2, self->napi.gro.hash[2].count); // Second packet hits the limit. self->header.common.sport = htons(40001); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( - &self->napi.gro_hash[3].list, skb))); + &self->napi.gro.hash[3].list, skb))); EXPECT_EQ(3, NAPI_GRO_CB(self->skb2)->count); - EXPECT_EQ(1, self->napi.gro_hash[2].count); + EXPECT_EQ(1, self->napi.gro.hash[2].count); EXPECT_STREQ("netif_receive_skb, id 1002, offset 4000", unit_log_get()); kfree_skb(self->skb2); - EXPECT_EQ(1, self->napi.gro_hash[2].count); - EXPECT_EQ(6, self->napi.gro_bitmask); + EXPECT_EQ(1, self->napi.gro.hash[2].count); + EXPECT_EQ(6, self->napi.gro.bitmask); // Third packet also hits the limit for skb, causing the bucket // to become empty. 
- homa->max_gro_skbs = 2; - homa_cores[cpu_number]->held_skb = self->skb; - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + self->homa.max_gro_skbs = 2; + cur_offload_core->held_skb = self->skb; + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( - &self->napi.gro_hash[3].list, skb))); + &self->napi.gro.hash[3].list, skb))); EXPECT_EQ(2, NAPI_GRO_CB(self->skb)->count); - EXPECT_EQ(0, self->napi.gro_hash[2].count); - EXPECT_EQ(2, self->napi.gro_bitmask); + EXPECT_EQ(0, self->napi.gro.hash[2].count); + EXPECT_EQ(2, self->napi.gro.bitmask); EXPECT_STREQ("netif_receive_skb, id 1000, offset 2000", unit_log_get()); kfree_skb(self->skb); } -TEST_F(homa_offload, homa_gro_complete__GRO_IDLE_NEW) +TEST_F(homa_offload, homa_gro_gen2) { - homa->gro_policy = HOMA_GRO_IDLE_NEW; - mock_cycles = 1000; - homa->gro_busy_cycles = 100; - cpu_number = 5; - atomic_set(&homa_cores[6]->softirq_backlog, 1); - homa_cores[6]->last_gro = 0; - atomic_set(&homa_cores[7]->softirq_backlog, 0); - homa_cores[7]->last_gro = 901; - atomic_set(&homa_cores[0]->softirq_backlog, 2); - homa_cores[0]->last_gro = 0; - atomic_set(&homa_cores[1]->softirq_backlog, 0); - homa_cores[1]->last_gro = 899; - atomic_set(&homa_cores[2]->softirq_backlog, 0); - homa_cores[2]->last_gro = 0; + self->homa.gro_policy = HOMA_GRO_GEN2; + mock_clock = 1000; + self->homa.busy_cycles = 100; + mock_set_core(5); + atomic_set(&per_cpu(homa_offload_core, 6).softirq_backlog, 1); + per_cpu(homa_offload_core, 6).last_gro = 0; + atomic_set(&per_cpu(homa_offload_core, 7).softirq_backlog, 0); + per_cpu(homa_offload_core, 7).last_gro = 901; + atomic_set(&per_cpu(homa_offload_core, 0).softirq_backlog, 2); + per_cpu(homa_offload_core, 0).last_gro = 0; + atomic_set(&per_cpu(homa_offload_core, 1).softirq_backlog, 0); + per_cpu(homa_offload_core, 1).last_gro = 899; + atomic_set(&per_cpu(homa_offload_core, 2).softirq_backlog, 0); + per_cpu(homa_offload_core, 2).last_gro = 0; // Avoid busy cores. homa_gro_complete(self->skb, 0); EXPECT_EQ(1, self->skb->hash - 32); - EXPECT_EQ(1, atomic_read(&homa_cores[1]->softirq_backlog)); + EXPECT_EQ(1, atomic_read(&per_cpu(homa_offload_core, 1).softirq_backlog)); // All cores busy; must rotate. 
homa_gro_complete(self->skb, 0); EXPECT_EQ(6, self->skb->hash - 32); homa_gro_complete(self->skb, 0); EXPECT_EQ(7, self->skb->hash - 32); - EXPECT_EQ(2, homa_cores[5]->softirq_offset); + EXPECT_EQ(2, per_cpu(homa_offload_core, 5).softirq_offset); homa_gro_complete(self->skb, 0); EXPECT_EQ(0, self->skb->hash - 32); homa_gro_complete(self->skb, 0); EXPECT_EQ(1, self->skb->hash - 32); homa_gro_complete(self->skb, 0); EXPECT_EQ(6, self->skb->hash - 32); - EXPECT_EQ(1, homa_cores[5]->softirq_offset); + EXPECT_EQ(1, per_cpu(homa_offload_core, 5).softirq_offset); +} + +TEST_F(homa_offload, homa_gro_gen3__basics) +{ + struct homa_offload_core *offload_core = cur_offload_core; + struct homa_offload_core *offload3 = &per_cpu(homa_offload_core, 3); + struct homa_offload_core *offload5 = &per_cpu(homa_offload_core, 5); + struct homa_offload_core *offload7 = &per_cpu(homa_offload_core, 7); + + self->homa.gro_policy = HOMA_GRO_GEN3; + offload_core->gen3_softirq_cores[0] = 3; + offload_core->gen3_softirq_cores[1] = 7; + offload_core->gen3_softirq_cores[2] = 5; + offload3->last_app_active = 4100; + offload7->last_app_active = 3900; + offload5->last_app_active = 2000; + mock_clock = 5000; + self->homa.busy_cycles = 1000; + + homa_gro_complete(self->skb, 0); + EXPECT_EQ(7, self->skb->hash - 32); + EXPECT_EQ(0, offload3->last_active); + EXPECT_EQ(5000, offload7->last_active); +} +TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) +{ + struct homa_offload_core *offload_core = cur_offload_core; + + self->homa.gro_policy = HOMA_GRO_GEN3; + offload_core->gen3_softirq_cores[0] = 3; + offload_core->gen3_softirq_cores[1] = -1; + offload_core->gen3_softirq_cores[2] = 5; + per_cpu(homa_offload_core, 3).last_app_active = 4100; + per_cpu(homa_offload_core, 5).last_app_active = 2000; + mock_clock = 5000; + self->homa.busy_cycles = 1000; + + homa_gro_complete(self->skb, 0); + EXPECT_EQ(3, self->skb->hash - 32); + EXPECT_EQ(5000, per_cpu(homa_offload_core, 3).last_active); +} +TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) +{ + struct homa_offload_core *offload_core = cur_offload_core; + + self->homa.gro_policy = HOMA_GRO_GEN3; + offload_core->gen3_softirq_cores[0] = 3; + offload_core->gen3_softirq_cores[1] = 7; + offload_core->gen3_softirq_cores[2] = 5; + per_cpu(homa_offload_core, 3).last_app_active = 4100; + per_cpu(homa_offload_core, 7).last_app_active = 4001; + per_cpu(homa_offload_core, 5).last_app_active = 4500; + mock_clock = 5000; + self->homa.busy_cycles = 1000; + + homa_gro_complete(self->skb, 0); + EXPECT_EQ(3, self->skb->hash - 32); + EXPECT_EQ(5000, per_cpu(homa_offload_core, 3).last_active); } + +TEST_F(homa_offload, homa_gro_complete__clear_held_skb) +{ + struct homa_offload_core *offload_core = &per_cpu(homa_offload_core, + smp_processor_id()); + + offload_core->held_skb = self->skb2; + homa_gro_complete(self->skb, 0); + EXPECT_EQ(NULL, offload_core->held_skb); +} TEST_F(homa_offload, homa_gro_complete__GRO_IDLE) { - homa->gro_policy = HOMA_GRO_IDLE; - homa_cores[6]->last_active = 30; - homa_cores[7]->last_active = 25; - homa_cores[0]->last_active = 20; - homa_cores[1]->last_active = 15; - homa_cores[2]->last_active = 10; - - cpu_number = 5; + self->homa.gro_policy = HOMA_GRO_IDLE; + per_cpu(homa_offload_core, 6).last_active = 30; + per_cpu(homa_offload_core, 7).last_active = 25; + per_cpu(homa_offload_core, 0).last_active = 20; + per_cpu(homa_offload_core, 1).last_active = 15; + per_cpu(homa_offload_core, 2).last_active = 10; + + mock_set_core(5); 
homa_gro_complete(self->skb, 0); EXPECT_EQ(1, self->skb->hash - 32); - homa_cores[6]->last_active = 5; - cpu_number = 5; + per_cpu(homa_offload_core, 6).last_active = 5; + mock_set_core(5); homa_gro_complete(self->skb, 0); EXPECT_EQ(6, self->skb->hash - 32); - cpu_number = 6; + mock_set_core(6); homa_gro_complete(self->skb, 0); EXPECT_EQ(2, self->skb->hash - 32); } diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 43295d39..efe136ab 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1,30 +1,69 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" +#include "homa_grant.h" +#include "homa_peer.h" +#include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" #include "mock.h" #include "utils.h" -int get_offset(struct sk_buff *skb) +#ifndef __STRIP__ /* See strip.py */ +#include "homa_pacer.h" +#include "homa_qdisc.h" +#include "homa_skb.h" +#else /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ + +#ifndef __STRIP__ /* See strip.py */ +#define XMIT_DATA(rpc, force) homa_xmit_data(rpc, force) +#else /* See strip.py */ +#define XMIT_DATA(rpc, force) homa_xmit_data(rpc) +#endif /* See strip.py */ + +/* The following hook function frees hook_rpc. */ +static struct homa_rpc *hook_rpc; +static void unlock_hook(char *id) +{ + if (strcmp(id, "unlock") != 0) + return; + if (hook_rpc) { + homa_rpc_end(hook_rpc); + hook_rpc = NULL; + } +} + +/* The following hook function frees an RPC when it is locked. */ +static void lock_free_hook(char *id) { - struct data_header *h = ((struct data_header *) - skb_transport_header(skb)); - return ntohl(h->seg.offset); + if (strcmp(id, "spin_lock") != 0) + return; + if (hook_rpc) { + homa_rpc_end(hook_rpc); + hook_rpc = NULL; + } +} + +#ifdef __STRIP__ /* See strip.py */ +static void mock_resend_data(struct homa_rpc *rpc, int start, int end, + int priority) +{ + homa_resend_data(rpc, start, end); +} +#define homa_resend_data(rpc, start, end, priority) \ + mock_resend_data(rpc, start, end, priority); +#endif /* See strip.py */ + +/* Compute the expected "truesize" value for a Homa packet, given + * the number of bytes of message data in the packet. 
+ */ +static int true_size(int msg_bytes) +{ + return SKB_TRUESIZE(msg_bytes + HOMA_SKB_EXTRA + + sizeof(struct homa_skb_info) + sizeof(struct homa_data_hdr)); } FIXTURE(homa_outgoing) { @@ -32,11 +71,13 @@ FIXTURE(homa_outgoing) { int client_port; struct in6_addr server_ip[1]; int server_port; - __u64 client_id; - __u64 server_id; + u64 client_id; + u64 server_id; struct homa homa; + struct homa_net *hnet; + struct net_device *dev; struct homa_sock hsk; - sockaddr_in_union server_addr; + union sockaddr_in_union server_addr; struct homa_peer *peer; }; FIXTURE_SETUP(homa_outgoing) @@ -48,28 +89,36 @@ FIXTURE_SETUP(homa_outgoing) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); - mock_cycles = 10000; - atomic64_set(&self->homa.link_idle_time, 10000); - self->homa.cycles_per_kbyte = 1000; + self->hnet = mock_hnet(0, &self->homa); + self->dev = mock_dev(0, &self->homa); + mock_clock = 10000; +#ifndef __STRIP__ /* See strip.py */ + self->homa.pacer->cycles_per_mbyte = 1000000; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; - mock_sock_init(&self->hsk, &self->homa, self->client_port); + self->homa.unsched_bytes = 10000; + self->homa.grant->window = 10000; + self->homa.qshared->fifo_fraction = 0; +#endif /* See strip.py */ + mock_sock_init(&self->hsk, self->hnet, self->client_port); self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->peer = homa_peer_find(&self->homa.peers, - &self->server_addr.in6.sin6_addr, &self->hsk.inet); + self->peer = homa_peer_get(&self->hsk, + &self->server_addr.in6.sin6_addr); unit_log_clear(); } FIXTURE_TEARDOWN(homa_outgoing) { + homa_peer_release(self->peer); homa_destroy(&self->homa); unit_teardown(); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, set_priority__priority_mapping) { + struct homa_grant_hdr h; struct homa_rpc *srpc; - struct grant_header h; srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -82,228 +131,410 @@ TEST_F(homa_outgoing, set_priority__priority_mapping) EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), srpc)); EXPECT_STREQ("7 3", mock_xmit_prios); } +#endif /* See strip.py */ -TEST_F(homa_outgoing, homa_message_out_init__basics) +TEST_F(homa_outgoing, homa_fill_data_interleaved) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc == NULL); - ASSERT_EQ(0, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, 3000), 0)); + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + char buffer[1000]; + homa_rpc_unlock(crpc); - EXPECT_EQ(3000, crpc->msgout.granted); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_STREQ("mtu 1500, max_pkt_data 1400, gso_size 1500, " - "gso_pkt_data 1400; " - "_copy_from_iter 1400 bytes at 1000; " - "_copy_from_iter 1400 bytes at 2400; " - "_copy_from_iter 200 bytes at 3800", unit_log_get()); + homa_message_out_init(crpc, 10000); + + unit_log_clear(); + struct sk_buff *skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, + 1500); + EXPECT_STREQ("_copy_from_iter 1500 bytes at 1000; " + "_copy_from_iter 1500 bytes at 2500; " + "_copy_from_iter 1500 bytes at 4000; " + "_copy_from_iter 500 bytes at 5500", unit_log_get()); + +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, 
data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#else /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#endif /* See strip.py */ + EXPECT_EQ(5000 + sizeof(struct homa_data_hdr) + + 3*sizeof(struct homa_seg_hdr), skb->len); + kfree_skb(skb); +} +TEST_F(homa_outgoing, homa_fill_data_interleaved__error_copying_data) +{ + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, + &self->server_addr); + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct sk_buff *skb; + + homa_rpc_unlock(crpc); + homa_message_out_init(crpc, 10000); + unit_log_clear(); - unit_log_message_out_packets(&crpc->msgout, 1); - EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 3000, offset 0, data_length 1400, " - "incoming 3000; " - "DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 3000, offset 1400, data_length 1400, " - "incoming 3000; " - "DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 3000, offset 2800, data_length 200, " - "incoming 3000", - unit_log_get()); - EXPECT_EQ(3, crpc->msgout.num_skbs); + mock_copy_data_errors = 1; + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); + EXPECT_EQ(EFAULT, -PTR_ERR(skb)); } -TEST_F(homa_outgoing, homa_message_out_init__gso_force_software) + +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__one_segment) { - struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, + struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc1 == NULL); - homa_rpc_unlock(crpc1); - mock_net_device.gso_max_size = 10000; - mock_xmit_log_verbose = 1; - self->homa.gso_force_software = 0; - ASSERT_EQ(0, -homa_message_out_init(crpc1, - unit_iov_iter((void *) 1000, 5000), 0)); + struct sk_buff *skb; + char buffer[1000]; + + homa_rpc_unlock(crpc); + homa_message_out_init(crpc, 500); + unit_log_clear(); - homa_xmit_data(crpc1, false); - EXPECT_SUBSTR("xmit DATA", unit_log_get()); - EXPECT_NOSUBSTR("TSO disabled", unit_log_get()); + skb = homa_tx_data_pkt_alloc(crpc, iter, 5000, 500, 2000); + EXPECT_STREQ("_copy_from_iter 500 bytes at 1000", unit_log_get()); + +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 500, offset 5000, data_length 500, incoming 500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#else /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 500, offset 5000, data_length 500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#endif /* See strip.py */ - struct homa_rpc *crpc2 = homa_rpc_new_client(&self->hsk, + EXPECT_EQ(0, skb_shinfo(skb)->gso_segs); + kfree_skb(skb); +} +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__cant_allocate_skb) +{ + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc2 == NULL); - homa_rpc_unlock(crpc2); - self->homa.gso_force_software = 1; - ASSERT_EQ(0, -homa_message_out_init(crpc2, - unit_iov_iter((void *) 1000, 5000), 0)); + struct sk_buff *skb; + + homa_rpc_unlock(crpc); + homa_message_out_init(crpc, 500); + unit_log_clear(); - homa_xmit_data(crpc2, false); - EXPECT_SUBSTR("TSO disabled", unit_log_get()); + 
mock_alloc_skb_errors = 1; + skb = homa_tx_data_pkt_alloc(crpc, iter, 0, 500, 2000); + EXPECT_TRUE(IS_ERR(skb)); + EXPECT_EQ(ENOMEM, -PTR_ERR(skb)); + EXPECT_STREQ("couldn't allocate sk_buff for outgoing message", + self->hsk.error_msg); } -TEST_F(homa_outgoing, homa_message_out_init__message_too_long) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__include_acks) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc == NULL); - ASSERT_EQ(EINVAL, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, HOMA_MAX_MESSAGE_LENGTH+1), - 0)); + struct homa_data_hdr h; + struct sk_buff *skb; + + ASSERT_NE(NULL, crpc); homa_rpc_unlock(crpc); + + crpc->peer->acks[0] = (struct homa_ack) { + .server_port = htons(200), + .client_id = cpu_to_be64(1000)}; + crpc->peer->num_acks = 1; + + homa_message_out_init(crpc, 500); + skb = homa_tx_data_pkt_alloc(crpc, iter, 0, 500, 2000); + ASSERT_NE(NULL, skb); + + homa_skb_get(skb, &h, 0, sizeof(h)); + EXPECT_STREQ("server_port 200, client_id 1000", + unit_ack_string(&h.ack)); + kfree_skb(skb); } -TEST_F(homa_outgoing, homa_message_out_init__packet_shape_short_message) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__multiple_segments_homa_fill_data_interleaved) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc == NULL); - unit_log_clear(); - ASSERT_EQ(0, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, 500), 0)); + struct sk_buff *skb; + char buffer[1000]; + homa_rpc_unlock(crpc); - EXPECT_SUBSTR("mtu 1500, max_pkt_data 1400, gso_size 1500, " - "gso_pkt_data 500;", unit_log_get()); + homa_message_out_init(crpc, 10000); + + unit_log_clear(); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); + EXPECT_STREQ("_copy_from_iter 1500 bytes at 1000; " + "_copy_from_iter 1500 bytes at 2500; " + "_copy_from_iter 1500 bytes at 4000; " + "_copy_from_iter 500 bytes at 5500", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#else /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#endif /* See strip.py */ + + EXPECT_EQ(4*(sizeof(struct homa_data_hdr) + crpc->hsk->ip_header_length + + HOMA_ETH_OVERHEAD) + 5000, + homa_get_skb_info(skb)->wire_bytes); + EXPECT_EQ(5000, homa_get_skb_info(skb)->data_bytes); + kfree_skb(skb); } -TEST_F(homa_outgoing, homa_message_out_init__max_gso_size_limit) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__error_in_homa_fill_data_interleaved) { - // First RPC: not limited by homa.gso_max_size. 
- struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc1 == NULL); + + homa_rpc_unlock(crpc); + homa_message_out_init(crpc, 10000); + unit_log_clear(); - mock_net_device.gso_max_size = 10000; - ASSERT_EQ(0, -homa_message_out_init(crpc1, - unit_iov_iter((void *) 1000, 5000), 0)); - homa_rpc_unlock(crpc1); - EXPECT_SUBSTR("gso_size 8600, gso_pkt_data 8400;", unit_log_get()); + mock_alloc_page_errors = -1; + struct sk_buff *skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, + 1500); + EXPECT_TRUE(IS_ERR(skb)); + EXPECT_EQ(ENOMEM, -PTR_ERR(skb)); +} +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__multiple_segments_tcp_hijacking) +{ + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc; + struct homa_sock hsk; + struct sk_buff *skb; + char buffer[1000]; + + self->homa.hijack_tcp = 1; + mock_sock_init(&hsk, self->hnet, self->client_port+1); + crpc = homa_rpc_alloc_client(&hsk, &self->server_addr); + homa_rpc_unlock(crpc); + homa_message_out_init(crpc, 10000); - // Second RPC: limited by homa.gso_max_size. - self->homa.max_gso_size = 3000; - struct homa_rpc *crpc2 = homa_rpc_new_client(&self->hsk, + unit_log_clear(); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); + EXPECT_STREQ("_copy_from_iter 5000 bytes at 1000", unit_log_get()); + + EXPECT_STREQ("DATA from 0.0.0.0:40001, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); + kfree_skb(skb); + unit_sock_destroy(&hsk); +} +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__error_copying_data_hijacking_path) +{ + struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc2 == NULL); + struct sk_buff *skb; + + homa_rpc_unlock(crpc); + homa_message_out_init(crpc, 500); + unit_log_clear(); - ASSERT_EQ(0, -homa_message_out_init(crpc2, - unit_iov_iter((void *) 1000, 5000), 0)); - homa_rpc_unlock(crpc2); - EXPECT_SUBSTR("gso_size 2920, gso_pkt_data 2800;", unit_log_get()); + mock_copy_data_errors = 1; + skb = homa_tx_data_pkt_alloc(crpc, iter, 5000, 500, 2000); + EXPECT_TRUE(IS_ERR(skb)); + EXPECT_EQ(EFAULT, -PTR_ERR(skb)); } -TEST_F(homa_outgoing, homa_message_out_init__gso_limit_less_than_mtu) +#endif /* See strip.py */ +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__gso_information) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc == NULL); + struct sk_buff *skb; + + homa_rpc_unlock(crpc); + homa_message_out_init(crpc, 10000); + unit_log_clear(); - mock_net_device.gso_max_size = 10000; - self->homa.max_gso_size = 1000; - ASSERT_EQ(0, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, 5000), 0)); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); + + EXPECT_EQ(4, skb_shinfo(skb)->gso_segs); + EXPECT_EQ(1500 + sizeof(struct homa_seg_hdr), + skb_shinfo(skb)->gso_size); + EXPECT_EQ(SKB_GSO_TCPV6, skb_shinfo(skb)->gso_type); + kfree_skb(skb); +} +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__gso_force_software) +{ + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, + 
&self->server_addr); + struct sk_buff *skb; + homa_rpc_unlock(crpc); - EXPECT_SUBSTR("gso_size 1500, gso_pkt_data 1400;", unit_log_get()); + homa_message_out_init(crpc, 10000); + self->homa.gso_force_software = 1; + + unit_log_clear(); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); + EXPECT_EQ(13, skb_shinfo(skb)->gso_type); + kfree_skb(skb); } -TEST_F(homa_outgoing, homa_message_out_init__packet_header) + +TEST_F(homa_outgoing, homa_message_out_fill__basics) { - mock_net_device.gso_max_size = 5000; - self->homa.max_gso_size = 20000; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + + mock_set_ipv6(&self->hsk); + ASSERT_FALSE(crpc == NULL); - ASSERT_EQ(0, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, 20000), 0)); + ASSERT_EQ(0, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 3000), 0)); homa_rpc_unlock(crpc); - char buffer[1000]; - EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 20000, offset 0, data_length 1400, " - "incoming 12600, extra segs 1400@1400 1400@2800", - homa_print_packet(crpc->msgout.packets, buffer, - sizeof(buffer))); - EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 20000, offset 4200, data_length 1400, " - "incoming 12600, extra segs 1400@5600 1400@7000", - homa_print_packet(homa_get_skb_info( - crpc->msgout.packets)->next_skb, - buffer, sizeof(buffer))); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(3000, crpc->msgout.granted); +#endif /* See strip.py */ + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_SUBSTR("mtu 1496, max_seg_data 1400, max_gso_data 1400; " + "_copy_from_iter 1400 bytes at 1000; " + "_copy_from_iter 1400 bytes at 2400; " + "_copy_from_iter 200 bytes at 3800", unit_log_get()); + unit_log_clear(); + unit_log_message_out_packets(&crpc->msgout, 1); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 0, data_length 1400, incoming 3000; " + "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 1400, data_length 1400, incoming 3000; " + "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 2800, data_length 200, incoming 3000", + unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 0, data_length 1400; " + "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 1400, data_length 1400; " + "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 2800, data_length 200", + unit_log_get()); +#endif /* See strip.py */ + EXPECT_EQ(3, crpc->msgout.num_skbs); + EXPECT_EQ(3000, crpc->msgout.copied_from_user); } -TEST_F(homa_outgoing, homa_message_out_init__cant_alloc_skb) +TEST_F(homa_outgoing, homa_message_out_fill__message_too_long) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); - mock_alloc_skb_errors = 1; - ASSERT_EQ(ENOMEM, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, 5000), 0)); + EXPECT_EQ(EINVAL, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, HOMA_MAX_MESSAGE_LENGTH+1), + 0)); + EXPECT_STREQ("message length exceeded HOMA_MAX_MESSAGE_LENGTH", + self->hsk.error_msg); homa_rpc_unlock(crpc); + EXPECT_EQ(0, crpc->msgout.skb_memory); + EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); } 
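+/* Note on the sk_wmem_alloc checks in this file: sock initialization
+ * leaves a baseline reference of 1 in sk_wmem_alloc, so a test in which
+ * no packet memory ends up charged to the socket expects exactly 1 here,
+ * not 0 (and tests that do charge memory expect true_size(...) + 1).
+ */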
-TEST_F(homa_outgoing, homa_message_out_init__set_gso_info) +TEST_F(homa_outgoing, homa_message_out_fill__zero_length_message) { - // First RPC: uses GSO. - mock_net_device.gso_max_size = 10000; - self->homa.max_gso_size = 4000; - struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + + ASSERT_FALSE(crpc == NULL); + EXPECT_EQ(EINVAL, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 0), 0)); + homa_rpc_unlock(crpc); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) +{ + struct homa_rpc *crpc1, *crpc2; + + crpc1 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc1 == NULL); - unit_log_clear(); - ASSERT_EQ(0, -homa_message_out_init(crpc1, - unit_iov_iter((void *) 1000, 2000), 0)); homa_rpc_unlock(crpc1); - EXPECT_EQ(1420, skb_shinfo(crpc1->msgout.packets)->gso_size); - // Second RPC: no GSO (message fits in one packet). - mock_net_device.gso_max_size = 10000; - self->homa.max_gso_size = 4200; - struct homa_rpc *crpc2 = homa_rpc_new_client(&self->hsk, - &self->server_addr); + crpc2 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); + homa_rpc_unlock(crpc2); + + mock_set_ipv6(&self->hsk); + self->hsk.sock.sk_protocol = IPPROTO_TCP; + + /* First try: not quite enough space for 3 packets in GSO. */ + self->dev->gso_max_size = mock_mtu - 1 + + 2 * UNIT_TEST_DATA_PER_PACKET; + homa_rpc_lock(crpc1); + ASSERT_EQ(0, -homa_message_out_fill(crpc1, + unit_iov_iter((void *) 1000, 10000), 0)); + homa_rpc_unlock(crpc1); + EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); + + /* Second try: just barely enough space for 3 packets in GSO. */ + self->dev->gso_max_size += 1; unit_log_clear(); - ASSERT_EQ(0, -homa_message_out_init(crpc2, - unit_iov_iter((void *) 1000, 1000), 0)); + homa_rpc_lock(crpc2); + ASSERT_EQ(0, -homa_message_out_fill(crpc2, + unit_iov_iter((void *) 1000, 10000), 0)); homa_rpc_unlock(crpc2); - EXPECT_EQ(0, skb_shinfo(crpc2->msgout.packets)->gso_size); + EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 4200", unit_log_get()); +} +#endif /* See strip.py */ +TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) +{ + struct homa_rpc *crpc1, *crpc2; - // Thired RPC: GSO limit is one packet - mock_net_device.gso_max_size = 10000; - self->homa.max_gso_size = 1000; - struct homa_rpc *crpc3 = homa_rpc_new_client(&self->hsk, - &self->server_addr); - ASSERT_FALSE(crpc3 == NULL); + crpc1 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc1 == NULL); + mock_set_ipv6(&self->hsk); + + /* First try: not quite enough space for 3 packets in GSO. */ + self->dev->gso_max_size = mock_mtu - 1 + + 2 * (UNIT_TEST_DATA_PER_PACKET + + sizeof(struct homa_seg_hdr)); + ASSERT_EQ(0, -homa_message_out_fill(crpc1, + unit_iov_iter((void *) 1000, 10000), 0)); + homa_rpc_unlock(crpc1); + EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); + + /* Second try: just barely enough space for 3 packets in GSO. 
*/ + crpc2 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc2 == NULL); + self->dev->gso_max_size += 1; unit_log_clear(); - ASSERT_EQ(0, -homa_message_out_init(crpc3, - unit_iov_iter((void *) 1000, 1000), 0)); - homa_rpc_unlock(crpc3); - EXPECT_EQ(0, skb_shinfo(crpc3->msgout.packets)->gso_size); + ASSERT_EQ(0, -homa_message_out_fill(crpc2, + unit_iov_iter((void *) 1000, 10000), 0)); + homa_rpc_unlock(crpc2); + EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 4200", unit_log_get()); } -TEST_F(homa_outgoing, homa_message_out_init__include_acks) +TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); - crpc->peer->acks[0] = (struct homa_ack) { - .client_port = htons(100), - .server_port = htons(200), - .client_id = cpu_to_be64(1000)}; - crpc->peer->num_acks = 1; - ASSERT_EQ(0, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, 500), 0)); + unit_log_clear(); + self->dev->gso_max_size = 10000; + self->homa.max_gso_size = 1000; + ASSERT_EQ(0, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 5000), 0)); homa_rpc_unlock(crpc); - struct data_header *h = (struct data_header *) crpc->msgout.packets->data; - EXPECT_STREQ("client_port 100, server_port 200, client_id 1000", - unit_ack_string(&h->seg.ack)); + EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 1400;", unit_log_get()); } -TEST_F(homa_outgoing, homa_message_out_init__cant_copy_data) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_outgoing, homa_message_out_fill__disable_overlap_xmit_because_of_homa_qdisc) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); + struct homa_qdisc_dev *qdev; + struct homa_rpc *crpc; + + qdev = homa_qdisc_qdev_get(self->dev); + crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); - mock_copy_data_errors = 2; - ASSERT_EQ(EFAULT, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, 3000), 0)); + ASSERT_EQ(0, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 5000), 1)); homa_rpc_unlock(crpc); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); + homa_qdisc_qdev_put(qdev); } -TEST_F(homa_outgoing, homa_message_out_init__multiple_segs_per_skbuff) +#endif /* See strip.py */ +TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); - mock_net_device.gso_max_size = 5000; + self->dev->gso_max_size = 5000; unit_log_clear(); - ASSERT_EQ(0, -homa_message_out_init(crpc, + ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 10000), 0)); homa_rpc_unlock(crpc); EXPECT_SUBSTR("_copy_from_iter 1400 bytes at 1000; " @@ -321,65 +552,107 @@ TEST_F(homa_outgoing, homa_message_out_init__multiple_segs_per_skbuff) "DATA 1400@4200 1400@5600 1400@7000; " "DATA 1400@8400 200@9800", unit_log_get()); + EXPECT_EQ(4200, homa_get_skb_info(crpc->msgout.packets)->data_bytes); } -TEST_F(homa_outgoing, homa_message_out_init__add_to_throttled) +TEST_F(homa_outgoing, homa_message_out_fill__error_in_homa_tx_data_packet_alloc) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); - 
ASSERT_EQ(0, -homa_message_out_init(crpc, + mock_set_ipv6(&self->hsk); + mock_copy_data_errors = 2; + + EXPECT_EQ(EFAULT, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 3000), 0)); + EXPECT_STREQ("couldn't copy message body into packet buffers", + self->hsk.error_msg); + homa_rpc_unlock(crpc); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(1, crpc->msgout.num_skbs); + EXPECT_EQ(true_size(1400), crpc->msgout.skb_memory); + EXPECT_EQ(true_size(1400) + 1, + refcount_read(&self->hsk.sock.sk_wmem_alloc)); +} +TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) +{ + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, + &self->server_addr); + + ASSERT_FALSE(crpc == NULL); + unit_hook_register(unlock_hook); + hook_rpc = crpc; + ASSERT_EQ(EINVAL, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 3000), 0)); + EXPECT_STREQ("rpc deleted while creating outgoing message", self->hsk.error_msg); + EXPECT_EQ(0, crpc->msgout.num_skbs); + EXPECT_EQ(RPC_DEAD, crpc->state); + EXPECT_EQ(0, crpc->msgout.skb_memory); + EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); + homa_rpc_unlock(crpc); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_outgoing, homa_message_out_fill__xmit_packets) +{ + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, + &self->server_addr); + + ASSERT_FALSE(crpc == NULL); + self->homa.unsched_bytes = 2800; + ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 5000), 1)); homa_rpc_unlock(crpc); - unit_log_clear(); - unit_log_filled_skbs(crpc->msgout.packets, 0); - EXPECT_STREQ("DATA 1400@0; DATA 1400@1400; DATA 1400@2800; " - "DATA 800@4200", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 2, next_offset 0", - unit_log_get()); + EXPECT_SUBSTR(" _copy_from_iter 1400 bytes at 1000; " + "xmit DATA 1400@0; " + "_copy_from_iter 1400 bytes at 2400; " + "xmit DATA 1400@1400; " + "_copy_from_iter 1400 bytes at 3800; " + "_copy_from_iter 800 bytes at 5200", unit_log_get()); } -TEST_F(homa_outgoing, homa_message_out_init__too_short_for_pipelining) +#endif /* See strip.py */ +TEST_F(homa_outgoing, homa_message_out_fill__packet_memory_accounting) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + + mock_set_ipv6(&self->hsk); + ASSERT_FALSE(crpc == NULL); - ASSERT_EQ(0, -homa_message_out_init(crpc, - unit_iov_iter((void *) 1000, 1000), 1)); + ASSERT_EQ(0, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 3000), 0)); homa_rpc_unlock(crpc); - EXPECT_SUBSTR("xmit DATA 1000@0", unit_log_get()); unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(3, crpc->msgout.num_skbs); + EXPECT_EQ(2 * true_size(1400) + true_size(200), + crpc->msgout.skb_memory); + EXPECT_EQ(2 * true_size(1400) + true_size(200) + 1, + refcount_read(&self->hsk.sock.sk_wmem_alloc)); } TEST_F(homa_outgoing, homa_xmit_control__server_request) { + struct homa_busy_hdr h; struct homa_rpc *srpc; - struct grant_header h; - homa_sock_bind(&self->homa.port_map, &self->hsk, self->server_port); + homa_sock_bind(self->hnet, &self->hsk, self->server_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); unit_log_clear(); - h.offset = htonl(12345); - h.priority = 4; - h.common.sender_id = cpu_to_be64(self->client_id); 
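+ /* Unlike GRANT, BUSY has no type-specific fields to set up here;
+ * homa_xmit_control fills in the common header from the RPC, which
+ * is all that the expected log line below checks.
+ */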
mock_xmit_log_verbose = 1; - EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), srpc)); - EXPECT_STREQ("xmit GRANT from 0.0.0.0:99, dport 40000, id 1235, " - "offset 12345, grant_prio 4", + EXPECT_EQ(0, homa_xmit_control(BUSY, &h, sizeof(h), srpc)); + EXPECT_STREQ("xmit BUSY from 0.0.0.0:99, dport 40000, id 1235", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("7", mock_xmit_prios); +#endif /* See strip.py */ } TEST_F(homa_outgoing, homa_xmit_control__client_response) { + struct homa_busy_hdr h; struct homa_rpc *crpc; - struct grant_header h; crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, @@ -387,29 +660,26 @@ TEST_F(homa_outgoing, homa_xmit_control__client_response) ASSERT_NE(NULL, crpc); unit_log_clear(); - h.offset = htonl(12345); - h.priority = 4; mock_xmit_log_verbose = 1; - EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), crpc)); - EXPECT_STREQ("xmit GRANT from 0.0.0.0:40000, dport 99, id 1234, " - "offset 12345, grant_prio 4", + EXPECT_EQ(0, homa_xmit_control(BUSY, &h, sizeof(h), crpc)); + EXPECT_STREQ("xmit BUSY from 0.0.0.0:40000, dport 99, id 1234", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("7", mock_xmit_prios); +#endif /* See strip.py */ } TEST_F(homa_outgoing, __homa_xmit_control__cant_alloc_skb) { + struct homa_busy_hdr h; struct homa_rpc *srpc; - struct grant_header h; srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); ASSERT_NE(NULL, srpc); unit_log_clear(); - h.common.type = GRANT; - h.offset = htonl(12345); - h.priority = 4; + h.common.type = BUSY; mock_xmit_log_verbose = 1; mock_alloc_skb_errors = 1; EXPECT_EQ(ENOBUFS, -__homa_xmit_control(&h, sizeof(h), srpc->peer, @@ -419,7 +689,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__cant_alloc_skb) TEST_F(homa_outgoing, __homa_xmit_control__pad_packet) { struct homa_rpc *srpc; - struct busy_header h; + struct homa_busy_hdr h; srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -430,15 +700,16 @@ TEST_F(homa_outgoing, __homa_xmit_control__pad_packet) "xmit unknown packet type 0x0", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) { + struct homa_grant_hdr h; struct homa_rpc *srpc; - struct grant_header h; // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, self->client_port); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, self->client_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -451,17 +722,17 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) mock_ip_queue_xmit_errors = 1; EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc)); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.control_xmit_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->control_xmit_errors); } TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) { + struct homa_grant_hdr h; struct homa_rpc *srpc; - struct grant_header h; // Make sure the test uses IPv6. 
mock_ipv6 = true; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, self->client_port); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, self->client_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -474,39 +745,49 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) mock_ip6_xmit_errors = 1; EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc)); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.control_xmit_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->control_xmit_errors); } TEST_F(homa_outgoing, homa_xmit_unknown) { - struct sk_buff *skb; - struct grant_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), + struct homa_grant_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->server_port), .sender_id = cpu_to_be64(99990), .type = GRANT}, - .offset = htonl(11200), - .priority = 3}; + .offset = htonl(11200)}; + struct sk_buff *skb; + mock_xmit_log_verbose = 1; - skb = mock_skb_new(self->client_ip, &h.common, 0, 0); + skb = mock_skb_alloc(self->client_ip, &h.common, 0, 0); homa_xmit_unknown(skb, &self->hsk); - EXPECT_STREQ("xmit UNKNOWN from 0.0.0.0:99, dport 40000, id 99991", + EXPECT_STREQ("xmit RPC_UNKNOWN from 0.0.0.0:99, dport 40000, id 99991", unit_log_get()); kfree_skb(skb); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__basics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); + +#ifndef __STRIP__ /* See strip.py */ crpc->msgout.sched_priority = 2; crpc->msgout.unscheduled = 2000; crpc->msgout.granted = 5000; homa_peer_set_cutoffs(crpc->peer, INT_MAX, 0, 0, 0, 0, INT_MAX, 7000, 0); +#else /* See strip.py */ + unit_reset_tx(crpc); +#endif /* See strip.py */ + unit_log_clear(); mock_clear_xmit_prios(); - homa_xmit_data(crpc, false); + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit DATA 1400@0; " "xmit DATA 1400@1400; " "xmit DATA 1400@2800; " @@ -516,16 +797,27 @@ TEST_F(homa_outgoing, homa_xmit_data__basics) unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit DATA 1400@0; " + "xmit DATA 1400@1400; " + "xmit DATA 1400@2800; " + "xmit DATA 1400@4200; " + "xmit DATA 400@5600", unit_log_get()); + EXPECT_EQ(6000, crpc->msgout.next_xmit_offset); +#endif /* See strip.py */ } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__stop_because_no_more_granted) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); - unit_log_clear(); + unit_log_clear(); crpc->msgout.granted = 1000; - homa_xmit_data(crpc, false); + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); @@ -536,11 +828,15 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 200, 1000); + unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_cycles = 500; + 
atomic64_set(&self->homa.pacer->link_idle_time, 11000); + self->homa.qshared->max_nic_est_backlog_cycles = 500; + self->homa.qshared->defer_min_bytes = 250; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - homa_xmit_data(crpc, false); + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 200@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); @@ -556,62 +852,140 @@ TEST_F(homa_outgoing, homa_xmit_data__force) self->server_port, self->client_id+2, 5000, 1000); /* First, get an RPC on the throttled list. */ - atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_cycles = 3000; + atomic64_set(&self->homa.pacer->link_idle_time, 11000); + self->homa.qshared->max_nic_est_backlog_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - homa_xmit_data(crpc1, false); + homa_rpc_lock(crpc1); + XMIT_DATA(crpc1, false); + homa_rpc_unlock(crpc1); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 1234, next_offset 2800", unit_log_get()); /* Now force transmission. */ unit_log_clear(); - homa_xmit_data(crpc2, true); - EXPECT_STREQ("xmit DATA 1400@0; wake_up_process pid -1", - unit_log_get()); + homa_rpc_lock(crpc2); + XMIT_DATA(crpc2, true); + homa_rpc_unlock(crpc2); + EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 1234, next_offset 2800; " "request id 1236, next_offset 1400", unit_log_get()); } +TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *crpc; + + qdev = homa_qdisc_qdev_get(self->dev); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 2000, 1000); + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 1000000); + self->homa.qshared->max_nic_est_backlog_cycles = 0; + self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); + EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 600@1400", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); + homa_qdisc_qdev_put(qdev); +} TEST_F(homa_outgoing, homa_xmit_data__throttle) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); + unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_cycles = 3000; + atomic64_set(&self->homa.pacer->link_idle_time, 11000); + self->homa.qshared->max_nic_est_backlog_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - homa_xmit_data(crpc, false); + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0; " - "xmit DATA 1400@1400; " - "wake_up_process pid -1", unit_log_get()); + "xmit DATA 1400@1400", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 1234, next_offset 2800", unit_log_get()); } -TEST_F(homa_outgoing, homa_xmit_data__update_next_xmit_offset) +TEST_F(homa_outgoing, homa_xmit_data__metrics_for_client_rpc) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); - unit_log_clear(); - crpc->msgout.granted = 3000; - homa_xmit_data(crpc, false); - EXPECT_EQ(4200, crpc->msgout.next_xmit_offset); + crpc->msgout.granted = 
4000; + homa_rpc_lock(crpc); + XMIT_DATA(crpc, false); + EXPECT_EQ(4200, homa_metrics_per_cpu()->client_request_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_requests_done); + crpc->msgout.granted = 6000; - homa_xmit_data(crpc, false); - EXPECT_EQ(6000, crpc->msgout.next_xmit_offset); + XMIT_DATA(crpc, false); + EXPECT_EQ(6000, homa_metrics_per_cpu()->client_request_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_requests_done); + homa_rpc_unlock(crpc); } +TEST_F(homa_outgoing, homa_xmit_data__metrics_for_server_rpc) +{ + struct homa_rpc *srpc; + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 1000, 10000); + + srpc->msgout.granted = 4000; + homa_rpc_lock(srpc); + XMIT_DATA(srpc, false); + EXPECT_EQ(4200, homa_metrics_per_cpu()->server_response_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_responses_done); + + srpc->msgout.granted = 9900; + XMIT_DATA(srpc, false); + EXPECT_EQ(10000, homa_metrics_per_cpu()->server_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_responses_done); + homa_rpc_unlock(srpc); +} +#endif /* See strip.py */ +TEST_F(homa_outgoing, homa_xmit_data__rpc_freed) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 6000, 1000); + +#ifndef __STRIP__ /* See strip.py */ + crpc->msgout.unscheduled = 2000; + crpc->msgout.granted = 5000; +#else /* See strip.py */ + unit_reset_tx(crpc); +#endif /* See strip.py */ + + unit_log_clear(); + homa_rpc_lock(crpc); + unit_hook_register(lock_free_hook); + hook_rpc = crpc; + XMIT_DATA(crpc, false); + homa_rpc_unlock(crpc); + EXPECT_STREQ("xmit DATA 1400@0; homa_rpc_end invoked", + unit_log_get()); + EXPECT_EQ(1400, crpc->msgout.next_xmit_offset); +} + +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, __homa_xmit_data__update_cutoff_version) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1000); + crpc->peer->cutoff_version = htons(123); mock_xmit_log_verbose = 1; unit_log_clear(); @@ -619,77 +993,101 @@ TEST_F(homa_outgoing, __homa_xmit_data__update_cutoff_version) __homa_xmit_data(crpc->msgout.packets, crpc, 4); EXPECT_SUBSTR("cutoff_version 123", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) { - int old_refcount; - struct dst_entry *dst; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1000); + struct dst_entry *dst; + int old_refcount; + +#ifdef __STRIP__ /* See strip.py */ + unit_reset_tx(crpc); +#endif /* See strip.py */ unit_log_clear(); dst = crpc->peer->dst; - old_refcount = dst->__refcnt.counter; + old_refcount = atomic_read(&dst->__rcuref.refcnt); skb_get(crpc->msgout.packets); +#ifndef __STRIP__ /* See strip.py */ __homa_xmit_data(crpc->msgout.packets, crpc, 6); +#else /* See strip.py */ + __homa_xmit_data(crpc->msgout.packets, crpc); +#endif /* See strip.py */ EXPECT_STREQ("xmit DATA 1000@0", unit_log_get()); EXPECT_EQ(dst, skb_dst(crpc->msgout.packets)); - EXPECT_EQ(old_refcount+1, dst->__refcnt.counter); + EXPECT_EQ(old_refcount+1, atomic_read(&dst->__rcuref.refcnt)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) { + struct homa_rpc *crpc; + // Make sure the test uses IPv4. 
mock_ipv6 = false; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, self->client_port); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, self->client_port); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 1000); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 1000); unit_log_clear(); mock_ip_queue_xmit_errors = 1; skb_get(crpc->msgout.packets); __homa_xmit_data(crpc->msgout.packets, crpc, 5); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.data_xmit_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->data_xmit_errors); } TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) { + struct homa_rpc *crpc; + // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, self->client_port); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, self->client_port); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 100, 1000); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 100, 1000); unit_log_clear(); mock_ip6_xmit_errors = 1; skb_get(crpc->msgout.packets); __homa_xmit_data(crpc->msgout.packets, crpc, 5); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.data_xmit_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->data_xmit_errors); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_resend_data__basics) { - mock_net_device.gso_max_size = 5000; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 16000, 1000); + struct homa_rpc *crpc; + + self->dev->gso_max_size = 5000; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); unit_log_clear(); mock_clear_xmit_prios(); mock_xmit_log_verbose = 1; + + /* Helps to detect errors in computing seg_offset. 
*/ + skb_push(crpc->msgout.packets, 8); + homa_resend_data(crpc, 7000, 10000, 2); - EXPECT_STREQ("xmit DATA from 0.0.0.0:40000, dport 99, id 1234, " - "message_length 16000, offset 7000, data_length 1400, " - "incoming 12600, RETRANSMIT; " - "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, " - "message_length 16000, offset 8400, data_length 1400, " - "incoming 12600, RETRANSMIT; " - "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, " - "message_length 16000, offset 9800, data_length 1400, " - "incoming 12600, RETRANSMIT", +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 7000, data_length 1400, incoming 10000, RETRANSMIT; " + "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 8400, data_length 1400, incoming 10000, RETRANSMIT; " + "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 9800, data_length 200, incoming 10000, RETRANSMIT", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 7000, data_length 1400, RETRANSMIT; " + "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 8400, data_length 1400, RETRANSMIT; " + "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 9800, data_length 1400, RETRANSMIT", + unit_log_get()); +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("2 2 2", mock_xmit_prios); +#endif /* See strip.py */ unit_log_clear(); mock_clear_xmit_prios(); @@ -702,7 +1100,9 @@ TEST_F(homa_outgoing, homa_resend_data__basics) mock_xmit_log_verbose = 0; homa_resend_data(crpc, 2800, 4200, 3); EXPECT_STREQ("xmit DATA retrans 1400@2800", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("3", mock_xmit_prios); +#endif /* See strip.py */ unit_log_clear(); mock_clear_xmit_prios(); @@ -710,428 +1110,135 @@ TEST_F(homa_outgoing, homa_resend_data__basics) homa_resend_data(crpc, 4199, 4201, 7); EXPECT_STREQ("xmit DATA retrans 1400@2800; " "xmit DATA retrans 1400@4200", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("7 7", mock_xmit_prios); +#endif /* See strip.py */ unit_log_clear(); mock_xmit_log_verbose = 0; homa_resend_data(crpc, 16000, 17000, 7); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_outgoing, homa_resend_data__set_incoming) +TEST_F(homa_outgoing, homa_resend_data__packet_doesnt_use_gso) { - mock_net_device.gso_max_size = 5000; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 16000, 1000); - unit_log_clear(); - mock_xmit_log_verbose = 1; - EXPECT_EQ(12600, crpc->msgout.granted); - homa_resend_data(crpc, 8400, 8800, 2); - EXPECT_SUBSTR("incoming 12600", unit_log_get()); - - unit_log_clear(); - homa_resend_data(crpc, 12700, 13000, 2); - EXPECT_SUBSTR("incoming 14000", unit_log_get()); + self->server_port, self->client_id, 1000, 2000); unit_log_clear(); - homa_resend_data(crpc, 15500, 16500, 2); - EXPECT_SUBSTR("incoming 16000", unit_log_get()); + homa_resend_data(crpc, 500, 1500, 2); + EXPECT_STREQ("xmit DATA retrans 1000@0", unit_log_get()); } -TEST_F(homa_outgoing, homa_resend_data__advance_next_xmit) +TEST_F(homa_outgoing, homa_resend_data__cant_allocate_skb) { - char buffer[1000]; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 4500, 1000); - unit_log_clear(); 
- mock_clear_xmit_prios(); - - /* First resend ends just short of a full packet. */ - homa_resend_data(crpc, 2000, 2799, 2); - EXPECT_EQ(1400, crpc->msgout.next_xmit_offset); - homa_print_packet(*crpc->msgout.next_xmit, buffer, sizeof(buffer)); - EXPECT_SUBSTR("offset 1400", buffer); - - /* Second resend ends on a packet boundary. */ - homa_resend_data(crpc, 2000, 4200, 2); - EXPECT_EQ(4200, crpc->msgout.next_xmit_offset); - homa_print_packet(*crpc->msgout.next_xmit, buffer, sizeof(buffer)); - EXPECT_SUBSTR("offset 4200", buffer); - - /* Third resend ends just before message end. */ - homa_resend_data(crpc, 2000, 4499, 2); - EXPECT_EQ(4200, crpc->msgout.next_xmit_offset); - homa_print_packet(*crpc->msgout.next_xmit, buffer, sizeof(buffer)); - EXPECT_SUBSTR("offset 4200", buffer); - - /* Fourth resend covers entire message. */ - homa_resend_data(crpc, 2000, 4500, 2); - EXPECT_EQ(4500, crpc->msgout.next_xmit_offset); - homa_print_packet(*crpc->msgout.next_xmit, buffer, sizeof(buffer)); - EXPECT_STREQ("skb is NULL!", buffer); -} + struct homa_rpc *crpc; -TEST_F(homa_outgoing, homa_outgoing_sysctl_changed) -{ - self->homa.link_mbps = 10000; - homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(808, self->homa.cycles_per_kbyte); - - self->homa.link_mbps = 1000; - homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(8080, self->homa.cycles_per_kbyte); - - self->homa.link_mbps = 40000; - homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(202, self->homa.cycles_per_kbyte); - - self->homa.max_nic_queue_ns = 200; - cpu_khz = 2000000; - homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(400, self->homa.max_nic_queue_cycles); -} + self->dev->gso_max_size = 5000; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); -TEST_F(homa_outgoing, homa_check_nic_queue__basics) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - mock_cycles = 8000; - self->homa.max_nic_queue_cycles = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - false)); - EXPECT_EQ(9500, atomic64_read(&self->homa.link_idle_time)); -} -TEST_F(homa_outgoing, homa_check_nic_queue__queue_full) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - mock_cycles = 7999; - self->homa.max_nic_queue_cycles = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(0, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - false)); - EXPECT_EQ(9000, atomic64_read(&self->homa.link_idle_time)); -} -TEST_F(homa_outgoing, homa_check_nic_queue__queue_full_but_force) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - mock_cycles = 7999; - self->homa.max_nic_queue_cycles = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(1, homa_check_nic_queue(&self->homa, 
crpc->msgout.packets, - true)); - EXPECT_EQ(9500, atomic64_read(&self->homa.link_idle_time)); -} -TEST_F(homa_outgoing, homa_check_nic_queue__pacer_metrics) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - homa_add_to_throttled(crpc); - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - self->homa.pacer_wake_time = 9800; - mock_cycles = 10000; - self->homa.max_nic_queue_cycles = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - true)); - EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time)); - EXPECT_EQ(500, homa_cores[cpu_number]->metrics.pacer_bytes); - EXPECT_EQ(200, homa_cores[cpu_number]->metrics.pacer_lost_cycles); + mock_clear_xmit_prios(); + mock_alloc_skb_errors = 1; + homa_resend_data(crpc, 7000, 10000, 2); + EXPECT_STREQ("skb allocation error", unit_log_get()); } -TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_outgoing, homa_resend_data__set_incoming) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - mock_cycles = 10000; - self->homa.max_nic_queue_cycles = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - true)); - EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time)); -} - -/* Don't know how to unit test homa_pacer_main... 
*/ + struct homa_rpc *crpc; -TEST_F(homa_outgoing, homa_pacer_xmit__basics) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 5000, 1000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, - 10000, 1000); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, - 150000, 1000); - homa_add_to_throttled(crpc1); - homa_add_to_throttled(crpc2); - homa_add_to_throttled(crpc3); - self->homa.max_nic_queue_cycles = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 1234, next_offset 2800; " - "request id 1236, next_offset 0; " - "request id 1238, next_offset 0", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) -{ - mock_cycles = 10000; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 2, 20000, 1000); - mock_cycles = 11000; - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 4, 10000, 1000); - mock_cycles = 12000; - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 6, 30000, 1000); - homa_add_to_throttled(crpc1); - homa_add_to_throttled(crpc2); - homa_add_to_throttled(crpc3); - - /* First attempt: pacer_fifo_count doesn't reach zero. */ - self->homa.max_nic_queue_cycles = 1300; - self->homa.pacer_fifo_count = 200; - self->homa.pacer_fifo_fraction = 150; - mock_cycles = 13000; - atomic64_set(&self->homa.link_idle_time, 10000); - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + self->dev->gso_max_size = 5000; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); unit_log_clear(); mock_xmit_log_verbose = 1; - homa_pacer_xmit(&self->homa); - EXPECT_SUBSTR("id 4, message_length 10000, offset 0, data_length 1400", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 1400; " - "request id 2, next_offset 0; " - "request id 6, next_offset 0", unit_log_get()); - EXPECT_EQ(50, self->homa.pacer_fifo_count); - - /* Second attempt: pacer_fifo_count reaches zero. 
*/ - atomic64_set(&self->homa.link_idle_time, 10000); - unit_log_clear(); - homa_pacer_xmit(&self->homa); - EXPECT_SUBSTR("id 2, message_length 20000, offset 0, data_length 1400", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 1400; " - "request id 2, next_offset 1400; " - "request id 6, next_offset 0", unit_log_get()); - EXPECT_EQ(900, self->homa.pacer_fifo_count); -} -TEST_F(homa_outgoing, homa_pacer_xmit__pacer_busy) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 10000, 1000); - homa_add_to_throttled(crpc); - self->homa.max_nic_queue_cycles = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - mock_trylock_errors = 1; - unit_log_clear(); - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 1234, next_offset 0", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__queue_empty) -{ - self->homa.max_nic_queue_cycles = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - homa_pacer_xmit(&self->homa); - unit_log_throttled(&self->homa); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__nic_queue_fills) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 10000, 1000); - homa_add_to_throttled(crpc); - self->homa.max_nic_queue_cycles = 2001; - mock_cycles = 10000; - atomic64_set(&self->homa.link_idle_time, 12000); - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 1234, next_offset 1400", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 5000, 1000); - homa_add_to_throttled(crpc); - self->homa.max_nic_queue_cycles = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - mock_trylock_errors = ~1; - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.pacer_skipped_rpcs); - unit_log_clear(); - mock_trylock_errors = 0; - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", - unit_log_get()); + EXPECT_EQ(10000, crpc->msgout.granted); + homa_resend_data(crpc, 8400, 8800, 2); + EXPECT_SUBSTR("incoming 8800", unit_log_get()); } -TEST_F(homa_outgoing, homa_pacer_xmit__remove_from_queue) +TEST_F(homa_outgoing, homa_resend_data__error_copying_data) { - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 2, - 1000, 1000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 4, - 10000, 1000); - homa_add_to_throttled(crpc1); - homa_add_to_throttled(crpc2); - self->homa.max_nic_queue_cycles = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + struct homa_rpc *crpc; + + self->dev->gso_max_size = 5000; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); unit_log_clear(); - 
homa_pacer_xmit(&self->homa); - EXPECT_STREQ("xmit DATA 1000@0; xmit DATA 1400@0", + mock_clear_xmit_prios(); + mock_max_skb_frags = 0; + homa_resend_data(crpc, 7000, 10000, 2); + EXPECT_STREQ("homa_resend_data got error 22 while copying data", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 1400", unit_log_get()); - EXPECT_TRUE(list_empty(&crpc1->throttled_links)); } - -/* Don't know how to unit test homa_pacer_stop... */ - -TEST_F(homa_outgoing, homa_add_to_throttled__basics) +#endif /* See strip.py */ +TEST_F(homa_outgoing, homa_resend_data__update_to_free_and_set_homa_info) { - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 2, 10000, 1000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 4, 5000, 1000); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 6, 15000, 1000); - struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 8, 12000, 1000); - struct homa_rpc *crpc5 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 10, 10000, 1000); - - /* Basics: add one RPC. */ - homa_add_to_throttled(crpc1); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 2, next_offset 0", unit_log_get()); + struct homa_skb_info *homa_info; + struct homa_rpc *crpc; + struct sk_buff *skb; - /* Check priority ordering. */ - homa_add_to_throttled(crpc2); - homa_add_to_throttled(crpc3); - homa_add_to_throttled(crpc4); - homa_add_to_throttled(crpc5); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 0; " - "request id 2, next_offset 0; " - "request id 10, next_offset 0; " - "request id 8, next_offset 0; " - "request id 6, next_offset 0", unit_log_get()); - - /* Don't reinsert if already present. 
*/ - homa_add_to_throttled(crpc1); + mock_set_ipv6(&self->hsk); + self->dev->gso_max_size = 5000; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 0; " - "request id 2, next_offset 0; " - "request id 10, next_offset 0; " - "request id 8, next_offset 0; " - "request id 6, next_offset 0", unit_log_get()); -} -TEST_F(homa_outgoing, homa_add_to_throttled__inc_metrics) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 1000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 10000, 1000); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, 15000, 1000); - - homa_add_to_throttled(crpc1); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.throttle_list_adds); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.throttle_list_checks); - - homa_add_to_throttled(crpc2); - EXPECT_EQ(2, homa_cores[cpu_number]->metrics.throttle_list_adds); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.throttle_list_checks); - - homa_add_to_throttled(crpc3); - EXPECT_EQ(3, homa_cores[cpu_number]->metrics.throttle_list_adds); - EXPECT_EQ(3, homa_cores[cpu_number]->metrics.throttle_list_checks); + homa_resend_data(crpc, 8400, 8800, 2); + skb = crpc->msgout.to_free; + ASSERT_NE(NULL, skb); + homa_info = homa_get_skb_info(skb); + EXPECT_EQ(NULL, homa_info->next_skb); + EXPECT_EQ(1538, homa_info->wire_bytes); + EXPECT_EQ(1400, homa_info->data_bytes); + EXPECT_EQ(1400, homa_info->seg_length); + EXPECT_EQ(8400, homa_info->offset); + EXPECT_EQ(crpc, homa_info->rpc); + EXPECT_EQ(1, refcount_read(&skb->users)); + IF_NO_STRIP(EXPECT_EQ(6, crpc->msgout.num_skbs)); } -TEST_F(homa_outgoing, homa_remove_from_throttled) +TEST_F(homa_outgoing, homa_rpc_tx_end) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 1000); - - homa_add_to_throttled(crpc); - EXPECT_FALSE(list_empty(&self->homa.throttled_rpcs)); - - // First attempt will remove. - unit_log_clear(); - homa_remove_from_throttled(crpc); - EXPECT_TRUE(list_empty(&self->homa.throttled_rpcs)); - EXPECT_STREQ("removing id 1234 from throttled list", unit_log_get()); - - // Second attempt: nothing to do. - unit_log_clear(); - homa_remove_from_throttled(crpc); - EXPECT_TRUE(list_empty(&self->homa.throttled_rpcs)); - EXPECT_STREQ("", unit_log_get()); -} + struct homa_rpc *crpc; + struct sk_buff *skbs[5]; + struct sk_buff *skb; + int i; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 6000, 1000); + ASSERT_EQ(5, crpc->msgout.num_skbs); + + /* First call: no packets passed to IP stack. */ + crpc->msgout.next_xmit_offset = 0; + EXPECT_EQ(0, homa_rpc_tx_end(crpc)); + + for (skb = crpc->msgout.packets, i = 0; skb != NULL; + skb = homa_get_skb_info(skb)->next_skb, i++) { + skbs[i] = skb; + skb_get(skb); + EXPECT_EQ(2, refcount_read(&skbs[i]->users)); + } + EXPECT_EQ(5, i); + + /* Second call: all packets passed to IP, but no packets complete. 
*/ + crpc->msgout.next_xmit_offset = 6000; + EXPECT_EQ(0, homa_rpc_tx_end(crpc)); + + /* Third call: packets 0 and 3 transmitted. */ + kfree_skb(skbs[0]); + kfree_skb(skbs[3]); + EXPECT_EQ(1400, homa_rpc_tx_end(crpc)); + EXPECT_EQ(skbs[1], crpc->msgout.first_not_tx); + + /* Fourth call: all packets transmitted. */ + kfree_skb(skbs[1]); + kfree_skb(skbs[2]); + kfree_skb(skbs[4]); + EXPECT_EQ(6000, homa_rpc_tx_end(crpc)); + EXPECT_EQ(NULL, crpc->msgout.first_not_tx); +} \ No newline at end of file diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c new file mode 100644 index 00000000..02e01b2d --- /dev/null +++ b/test/unit_homa_pacer.c @@ -0,0 +1,607 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +#include "homa_impl.h" +#include "homa_pacer.h" +#include "homa_rpc.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +static struct homa_rpc *hook_rpc; +static int hook_count; +static void unmanage_hook(char *id) { + if (strcmp(id, "spin_lock") != 0) + return; + if (hook_count <= 0) + return; + hook_count--; + if (hook_count == 0) + homa_pacer_unmanage_rpc(hook_rpc); +} + +static u64 hook_exit_count; +static void exit_hook(char *id) { + if (strcmp(id, "prepare_to_wait") != 0) + return; + if (hook_exit_count > 0) { + hook_exit_count--; + if (hook_exit_count == 0) + mock_exit_thread = true; + } +} + +static struct homa_pacer *hook_pacer; +static void exit_idle_hook(char *id) { + if (strcmp(id, "schedule") == 0) + unit_log_printf("; ", "time %llu", mock_clock); + if (list_empty(&hook_pacer->throttled_rpcs)) + mock_exit_thread = true; +} + +static void manage_hook(char *id) +{ + if (strcmp(id, "prepare_to_wait") == 0 && hook_rpc) { + homa_pacer_manage_rpc(hook_rpc); + hook_rpc = NULL; + } +} + +FIXTURE(homa_pacer) { + struct in6_addr client_ip[1]; + int client_port; + struct in6_addr server_ip[1]; + int server_port; + u64 client_id; + u64 server_id; + struct homa homa; + struct homa_net *hnet; + struct homa_sock hsk; +}; +FIXTURE_SETUP(homa_pacer) +{ + self->client_ip[0] = unit_get_in_addr("196.168.0.1"); + self->client_port = 40000; + self->server_ip[0] = unit_get_in_addr("1.2.3.4"); + self->server_port = 99; + self->client_id = 1234; + self->server_id = 1235; + homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); + self->homa.pacer->cycles_per_mbyte = 1000000; + self->homa.qshared->defer_min_bytes = 0; + self->homa.qshared->fifo_fraction = 0; + mock_sock_init(&self->hsk, self->hnet, self->client_port); +} +FIXTURE_TEARDOWN(homa_pacer) +{ + homa_destroy(&self->homa); + unit_teardown(); +} + +TEST_F(homa_pacer, homa_pacer_new__success) +{ + struct homa_pacer *pacer; + + pacer = homa_pacer_alloc(&self->homa); + EXPECT_FALSE(IS_ERR(pacer)); + EXPECT_EQ(&self->homa, pacer->homa); + homa_pacer_free(pacer); +} +TEST_F(homa_pacer, homa_pacer_new__cant_allocate_memory) +{ + struct homa_pacer *pacer; + + mock_kmalloc_errors = 1; + pacer = homa_pacer_alloc(&self->homa); + EXPECT_TRUE(IS_ERR(pacer)); + EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); +} +TEST_F(homa_pacer, homa_pacer_new__cant_create_pacer_thread) +{ + struct homa_pacer *pacer; + + mock_kthread_create_errors = 1; + pacer = homa_pacer_alloc(&self->homa); + EXPECT_TRUE(IS_ERR(pacer)); + EXPECT_EQ(EACCES, -PTR_ERR(pacer)); +} + +TEST_F(homa_pacer, homa_pacer_free__basics) +{ + struct homa_pacer *pacer; + + pacer = homa_pacer_alloc(&self->homa); + EXPECT_FALSE(IS_ERR(pacer)); + unit_log_clear(); + homa_pacer_free(pacer); + EXPECT_STREQ("kthread_stop", 
unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_free__no_thread) +{ + struct homa_pacer *pacer; + + pacer = homa_pacer_alloc(&self->homa); + EXPECT_FALSE(IS_ERR(pacer)); + pacer->kthread = NULL; + unit_log_clear(); + homa_pacer_free(pacer); + EXPECT_STREQ("", unit_log_get()); +} + +TEST_F(homa_pacer, homa_pacer_check_nic_q__success) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + mock_clock = 8000; + self->homa.qshared->max_nic_est_backlog_cycles = 1000; + EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, false)); + EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); +} +TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + mock_clock = 7999; + self->homa.qshared->max_nic_est_backlog_cycles = 1000; + EXPECT_EQ(0, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, false)); + EXPECT_EQ(9000, atomic64_read(&self->homa.pacer->link_idle_time)); +} +TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full_but_force) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + mock_clock = 7999; + self->homa.qshared->max_nic_est_backlog_cycles = 1000; + EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, true)); + EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); +} +TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_empty) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + mock_clock = 10000; + self->homa.qshared->max_nic_est_backlog_cycles = 1000; + EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, true)); + EXPECT_EQ(10500, atomic64_read(&self->homa.pacer->link_idle_time)); +} + +TEST_F(homa_pacer, homa_pacer_main__exit) +{ + unit_hook_register(exit_hook); + hook_exit_count = 10; + homa_pacer_main(self->homa.pacer); + EXPECT_EQ(0, hook_exit_count); +} +TEST_F(homa_pacer, homa_pacer_main__xmit_data) +{ + struct homa_rpc *crpc1, *crpc2; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+2, 10000, 1000); + + homa_pacer_manage_rpc(crpc1); + homa_pacer_manage_rpc(crpc2); + self->homa.qshared->max_nic_est_backlog_cycles = 3000; + mock_clock_tick = 200; + unit_hook_register(exit_idle_hook); + hook_pacer = self->homa.pacer; + unit_log_clear(); + homa_pacer_main(self->homa.pacer); + EXPECT_STREQ("xmit DATA 
1400@0; " + "xmit DATA 1400@1400; " + "xmit DATA 1400@2800; time 1600; time 2200; " + "xmit DATA 800@4200; " + "removing id 1234 from throttled list; time 3200; " + "xmit DATA 1400@0; time 4400; " + "xmit DATA 1400@1400; time 5600; time 6200; " + "xmit DATA 1400@2800; time 7400; " + "xmit DATA 1400@4200; time 8600; time 9200; " + "xmit DATA 1400@5600; time 10400; time 11000; " + "xmit DATA 1400@7000; time 12200; " + "xmit DATA 1400@8400; time 13400; time 14000; " + "xmit DATA 200@9800; " + "removing id 1236 from throttled list", + unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_main__rpc_arrives_while_sleeping) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, + 3000, 1000); + + unit_hook_register(exit_hook); + hook_exit_count = 5; + mock_clock_tick = 200; + unit_hook_register(manage_hook); + hook_rpc = crpc; + self->homa.qshared->max_nic_est_backlog_cycles = 2000; + + unit_log_clear(); + homa_pacer_main(self->homa.pacer); + EXPECT_STREQ("xmit DATA 1400@0; " + "xmit DATA 1400@1400; " + "xmit DATA 200@2800; " + "removing id 1234 from throttled list", + unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_main__exit_on_signal) +{ + mock_prepare_to_wait_errors = 1; + mock_prepare_to_wait_status = -EINVAL; + unit_log_clear(); + homa_pacer_main(self->homa.pacer); +} + +TEST_F(homa_pacer, homa_pacer_xmit__basics) +{ + struct homa_rpc *crpc1, *crpc2, *crpc3; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+2, 10000, 1000); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+4, 150000, 1000); + + homa_pacer_manage_rpc(crpc1); + homa_pacer_manage_rpc(crpc2); + homa_pacer_manage_rpc(crpc3); + self->homa.qshared->max_nic_est_backlog_cycles = 2000; + unit_log_clear(); + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", + unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 1234, next_offset 2800; " + "request id 1236, next_offset 0; " + "request id 1238, next_offset 0", unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_xmit__pacer_already_active) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 10000, 1000); + + homa_pacer_manage_rpc(crpc); + self->homa.qshared->max_nic_est_backlog_cycles = 2000; + mock_trylock_errors = 1; + unit_log_clear(); + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 1234, next_offset 0", unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_xmit__nic_queue_fills) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 10000, 1000); + + homa_pacer_manage_rpc(crpc); + self->homa.qshared->max_nic_est_backlog_cycles = 2001; + mock_clock = 10000; + atomic64_set(&self->homa.pacer->link_idle_time, 12000); + unit_log_clear(); + homa_pacer_xmit(self->homa.pacer); + + /* Just room for one packet before NIC queue fills. 
*/
+	EXPECT_STREQ("xmit DATA 1400@0", unit_log_get());
+	unit_log_clear();
+	unit_log_throttled(&self->homa);
+	EXPECT_STREQ("request id 1234, next_offset 1400", unit_log_get());
+}
+TEST_F(homa_pacer, homa_pacer_xmit__queue_empty)
+{
+	self->homa.qshared->max_nic_est_backlog_cycles = 2000;
+	unit_log_clear();
+	homa_pacer_xmit(self->homa.pacer);
+	unit_log_throttled(&self->homa);
+	EXPECT_STREQ("", unit_log_get());
+}
+TEST_F(homa_pacer, homa_pacer_xmit__xmit_fifo)
+{
+	struct homa_rpc *crpc1, *crpc2, *crpc3;
+
+	mock_clock = 10000;
+	crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+				self->server_ip, self->server_port, 2,
+				20000, 1000);
+	mock_clock = 11000;
+	crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+				self->server_ip, self->server_port, 4,
+				10000, 1000);
+	mock_clock = 12000;
+	crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+				self->server_ip, self->server_port, 6,
+				30000, 1000);
+	homa_pacer_manage_rpc(crpc1);
+	homa_pacer_manage_rpc(crpc2);
+	homa_pacer_manage_rpc(crpc3);
+
+	/* First attempt: pacer->fifo_count doesn't reach zero. */
+	self->homa.qshared->max_nic_est_backlog_cycles = 1300;
+	self->homa.pacer->fifo_count = 200;
+	self->homa.qshared->fifo_fraction = 150;
+	mock_clock = 13000;
+	atomic64_set(&self->homa.pacer->link_idle_time, 10000);
+	unit_log_clear();
+	mock_xmit_log_verbose = 1;
+	homa_pacer_xmit(self->homa.pacer);
+	EXPECT_SUBSTR("id 4, message_length 10000, offset 0, data_length 1400",
+		      unit_log_get());
+	unit_log_clear();
+	unit_log_throttled(&self->homa);
+	EXPECT_STREQ("request id 4, next_offset 1400; "
+		     "request id 2, next_offset 0; "
+		     "request id 6, next_offset 0", unit_log_get());
+	EXPECT_EQ(50, self->homa.pacer->fifo_count);
+
+	/* Second attempt: pacer->fifo_count reaches zero. */
+	atomic64_set(&self->homa.pacer->link_idle_time, 10000);
+	unit_log_clear();
+	homa_pacer_xmit(self->homa.pacer);
+	EXPECT_SUBSTR("id 2, message_length 20000, offset 0, data_length 1400",
+		      unit_log_get());
+	unit_log_clear();
+	unit_log_throttled(&self->homa);
+	EXPECT_STREQ("request id 4, next_offset 1400; "
+		     "request id 2, next_offset 1400; "
+		     "request id 6, next_offset 0", unit_log_get());
+	EXPECT_EQ(900, self->homa.pacer->fifo_count);
+}
+TEST_F(homa_pacer, homa_pacer_xmit__rpc_removed_from_queue_before_locked)
+{
+	struct homa_rpc *crpc;
+
+	crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+			       self->server_ip, self->server_port,
+			       self->client_id, 10000, 1000);
+
+	homa_pacer_manage_rpc(crpc);
+	self->homa.qshared->max_nic_est_backlog_cycles = 10000;
+	unit_log_clear();
+	unit_hook_register(unmanage_hook);
+	hook_rpc = crpc;
+	hook_count = 2;
+	homa_pacer_xmit(self->homa.pacer);
+
+	EXPECT_STREQ("removing id 1234 from throttled list", unit_log_get());
+	unit_log_clear();
+	unit_log_throttled(&self->homa);
+	EXPECT_STREQ("", unit_log_get());
+}
+TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue)
+{
+	struct homa_rpc *crpc1, *crpc2;
+
+	crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+				self->server_ip, self->server_port, 2,
+				1000, 1000);
+	crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+				self->server_ip, self->server_port, 4,
+				2000, 1000);
+
+	homa_pacer_manage_rpc(crpc1);
+	homa_pacer_manage_rpc(crpc2);
+	self->homa.qshared->max_nic_est_backlog_cycles = 2000;
+	unit_log_clear();
+
+	/* First call completes id 2, but id 4 is still in the queue.
*/ + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("xmit DATA 1000@0; removing id 2 from throttled list; " + "xmit DATA 1400@0", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 4, next_offset 1400", unit_log_get()); + EXPECT_TRUE(list_empty(&crpc1->throttled_links)); + + /* Second call completes id 4, queue now empty. */ + unit_log_clear(); + self->homa.qshared->max_nic_est_backlog_cycles = 10000; + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("xmit DATA 600@1400; removing id 4 from throttled list", + unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); + EXPECT_TRUE(list_empty(&crpc2->throttled_links)); +} + +TEST_F(homa_pacer, homa_pacer_manage_rpc__basics) +{ + struct homa_rpc *crpc1, *crpc2, *crpc3, *crpc4, *crpc5; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 2, 10000, + 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 4, 5000, + 1000); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 6, 15000, + 1000); + crpc4 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 8, 12000, + 1000); + crpc5 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 10, 10000, + 1000); + + /* Basics: add one RPC. */ + mock_log_wakeups = 1; + unit_log_clear(); + homa_pacer_manage_rpc(crpc1); + EXPECT_STREQ("wake_up", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 2, next_offset 0", unit_log_get()); + + /* Check priority ordering. */ + homa_pacer_manage_rpc(crpc2); + homa_pacer_manage_rpc(crpc3); + homa_pacer_manage_rpc(crpc4); + homa_pacer_manage_rpc(crpc5); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 4, next_offset 0; " + "request id 2, next_offset 0; " + "request id 10, next_offset 0; " + "request id 8, next_offset 0; " + "request id 6, next_offset 0", unit_log_get()); + + /* Don't reinsert if already present. 
*/ + unit_log_clear(); + homa_pacer_manage_rpc(crpc1); + EXPECT_STREQ("", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 4, next_offset 0; " + "request id 2, next_offset 0; " + "request id 10, next_offset 0; " + "request id 8, next_offset 0; " + "request id 6, next_offset 0", unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_manage_rpc__inc_metrics) +{ + struct homa_rpc *crpc1, *crpc2, *crpc3; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+2, 10000, 1000); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+4, 15000, 1000); + + homa_pacer_manage_rpc(crpc1); + EXPECT_EQ(1, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(0, homa_metrics_per_cpu()->throttle_list_checks); + + homa_pacer_manage_rpc(crpc2); + EXPECT_EQ(2, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(1, homa_metrics_per_cpu()->throttle_list_checks); + + homa_pacer_manage_rpc(crpc3); + EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_checks); +} + +TEST_F(homa_pacer, homa_pacer_unmanage_rpc__basics) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + + homa_pacer_manage_rpc(crpc); + EXPECT_FALSE(list_empty(&self->homa.pacer->throttled_rpcs)); + + // First attempt will remove. + unit_log_clear(); + homa_pacer_unmanage_rpc(crpc); + EXPECT_TRUE(list_empty(&self->homa.pacer->throttled_rpcs)); + EXPECT_STREQ("removing id 1234 from throttled list", unit_log_get()); + + // Second attempt: nothing to do. 
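+	// (Presumably homa_pacer_unmanage_rpc is idempotent: with the RPC
+	// already off the throttled list, nothing should be logged.)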
+ unit_log_clear(); + homa_pacer_unmanage_rpc(crpc); + EXPECT_TRUE(list_empty(&self->homa.pacer->throttled_rpcs)); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_unmanage_rpc__metrics) +{ + struct homa_rpc *crpc1, *crpc2; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+2, 5000, 1000); + + mock_clock = 1000; + homa_pacer_manage_rpc(crpc1); + EXPECT_EQ(1000, self->homa.pacer->throttle_add); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + + mock_clock = 3000; + homa_pacer_manage_rpc(crpc2); + EXPECT_EQ(3000, self->homa.pacer->throttle_add); + EXPECT_EQ(2000, homa_metrics_per_cpu()->nic_backlog_cycles); + + mock_clock = 7000; + homa_pacer_unmanage_rpc(crpc1); + EXPECT_EQ(3000, self->homa.pacer->throttle_add); + EXPECT_EQ(2000, homa_metrics_per_cpu()->nic_backlog_cycles); + + mock_clock = 8000; + homa_pacer_unmanage_rpc(crpc2); + EXPECT_EQ(3000, self->homa.pacer->throttle_add); + EXPECT_EQ(7000, homa_metrics_per_cpu()->nic_backlog_cycles); +} + +TEST_F(homa_pacer, homa_pacer_update_sysctl_deps) +{ + self->homa.link_mbps = 10000; + homa_pacer_update_sysctl_deps(self->homa.pacer); + EXPECT_EQ(808000, self->homa.pacer->cycles_per_mbyte); + + self->homa.link_mbps = 1000; + homa_pacer_update_sysctl_deps(self->homa.pacer); + EXPECT_EQ(8080000, self->homa.pacer->cycles_per_mbyte); + + self->homa.link_mbps = 40000; + homa_pacer_update_sysctl_deps(self->homa.pacer); + EXPECT_EQ(202000, self->homa.pacer->cycles_per_mbyte); +} \ No newline at end of file diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c new file mode 100644 index 00000000..da09c673 --- /dev/null +++ b/test/unit_homa_peer.c @@ -0,0 +1,864 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +#include "homa_impl.h" +#include "homa_peer.h" +#include "homa_rpc.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +struct in6_addr ip1111[1]; +struct in6_addr ip2222[1]; +struct in6_addr ip3333[1]; +struct in6_addr ip4444[1]; +struct in6_addr ip5555[1]; +struct in6_addr ip6666[1]; + +FIXTURE(homa_peer) { + struct homa homa; + struct homa_net *hnet; + struct homa_sock hsk; + struct in6_addr client_ip[1]; + struct in6_addr server_ip[1]; + int server_port; +}; +FIXTURE_SETUP(homa_peer) +{ + homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); + self->client_ip[0] = unit_get_in_addr("196.168.0.1"); + self->server_ip[0] = unit_get_in_addr("1.2.3.4"); + ip1111[0] = unit_get_in_addr("1::1:1:1"); + ip2222[0] = unit_get_in_addr("2::2:2:2"); + ip3333[0] = unit_get_in_addr("3::3:3:3"); + ip4444[0] = unit_get_in_addr("4::4:4:4"); + ip5555[0] = unit_get_in_addr("5::5:5:5"); + ip6666[0] = unit_get_in_addr("6::6:6:6"); + self->server_port = 99; +} +FIXTURE_TEARDOWN(homa_peer) +{ + homa_destroy(&self->homa); + unit_teardown(); +} + +#ifndef __STRIP__ /* See strip.py */ +static void peer_spinlock_hook(char *id) +{ + if (strcmp(id, "spin_lock") != 0) + return; + mock_clock += 1000; +} +#endif /* See strip.py */ + +static struct _test_data_homa_peer *test_data; +static struct homa_peer *conflicting_peer; +static int peer_race_hook_invocations; +static void peer_race_hook(char *id) +{ + if (strcmp(id, "kmalloc") != 0) + return; + if (peer_race_hook_invocations > 0) + 
return;
+	peer_race_hook_invocations++;
+
+	/* Create a peer with the same address as the one being created
+	 * by the current test.
+	 */
+	conflicting_peer = homa_peer_get(&test_data->hsk, ip3333);
+	homa_peer_release(conflicting_peer);
+	jiffies += 10;
+}
+
+static struct homa_peertab *hook_peertab;
+static void stop_gc_hook(char *id)
+{
+	if (strcmp(id, "kfree") != 0)
+		return;
+	unit_log_printf("; ", "gc_stop_count %d", hook_peertab->gc_stop_count);
+}
+
+TEST_F(homa_peer, homa_peer_alloc_peertab__success)
+{
+	struct homa_peertab *peertab;
+
+	peertab = homa_peer_alloc_peertab();
+	EXPECT_FALSE(IS_ERR(peertab));
+
+	homa_peer_free_peertab(peertab);
+}
+TEST_F(homa_peer, homa_peer_alloc_peertab__cant_alloc_peertab)
+{
+	struct homa_peertab *peertab;
+
+	mock_kmalloc_errors = 1;
+	peertab = homa_peer_alloc_peertab();
+	EXPECT_TRUE(IS_ERR(peertab));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(peertab));
+}
+TEST_F(homa_peer, homa_peer_alloc_peertab__rhashtable_init_fails)
+{
+	struct homa_peertab *peertab;
+
+	mock_rht_init_errors = 1;
+	peertab = homa_peer_alloc_peertab();
+	EXPECT_TRUE(IS_ERR(peertab));
+	EXPECT_EQ(EINVAL, -PTR_ERR(peertab));
+}
+#ifndef __STRIP__ /* See strip.py */
+TEST_F(homa_peer, homa_peer_alloc_peertab__cant_register_sysctl)
+{
+	struct homa_peertab *peertab;
+
+	mock_register_sysctl_errors = 1;
+	peertab = homa_peer_alloc_peertab();
+	EXPECT_TRUE(IS_ERR(peertab));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(peertab));
+	EXPECT_SUBSTR("couldn't register sysctl parameters for Homa peertab",
+		      mock_printk_output);
+}
+#endif /* See strip.py */
+
+TEST_F(homa_peer, homa_peer_free_net__basics)
+{
+	/* Create peers from two different netns's, make sure only
+	 * those from one get freed. */
+	struct homa_peer *peer;
+	struct homa_sock hsk2;
+	struct homa_net *hnet2;
+
+	hnet2 = mock_hnet(1, &self->homa);
+	mock_sock_init(&hsk2, hnet2, 44);
+
+	peer = homa_peer_get(&self->hsk, ip1111);
+	homa_peer_release(peer);
+	peer = homa_peer_get(&self->hsk, ip2222);
+	homa_peer_release(peer);
+	peer = homa_peer_get(&hsk2, ip3333);
+	homa_peer_release(peer);
+	EXPECT_EQ(3, unit_count_peers(&self->homa));
+	EXPECT_EQ(3, self->homa.peertab->num_peers);
+	EXPECT_EQ(2, self->hnet->num_peers);
+
+	homa_peer_free_net(self->hnet);
+	EXPECT_EQ(1, unit_count_peers(&self->homa));
+	EXPECT_EQ(1, self->homa.peertab->num_peers);
+}
+TEST_F(homa_peer, homa_peer_free_net__set_gc_stop_count)
+{
+	struct homa_peer *peer;
+
+	peer = homa_peer_get(&self->hsk, ip1111);
+	homa_peer_release(peer);
+
+	unit_hook_register(stop_gc_hook);
+	hook_peertab = self->homa.peertab;
+	unit_log_clear();
+	self->homa.peertab->gc_stop_count = 3;
+
+	homa_peer_free_net(self->hnet);
+	EXPECT_EQ(0, unit_count_peers(&self->homa));
+	EXPECT_SUBSTR("gc_stop_count 4", unit_log_get());
+	EXPECT_EQ(3, self->homa.peertab->gc_stop_count);
+}
+
+TEST_F(homa_peer, homa_peer_release_fn)
+{
+	struct homa_peer *peer;
+	struct dst_entry *dst;
+
+	peer = homa_peer_alloc(&self->hsk, ip3333);
+	dst = peer->dst;
+	dst_hold(dst);
+	EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt));
+	homa_peer_release(peer);
+
+	homa_peer_release_fn(peer, NULL);
+	EXPECT_EQ(1, atomic_read(&dst->__rcuref.refcnt));
+	dst_release(dst);
+}
+
+TEST_F(homa_peer, homa_peer_free_peertab)
+{
+	struct homa_peer *peer;
+
+	/* Create two peers, release one before destroying the table, the
+	 * other after (test infrastructure will detect improper freeing).
+ */ + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); + peer = homa_peer_get(&self->hsk, ip2222); + + unit_log_clear(); + homa_peer_free_peertab(self->homa.peertab); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_SUBSTR("unregister_net_sysctl_table", unit_log_get()); +#endif /* See strip.py */ + + homa_peer_release(peer); + self->homa.peertab = homa_peer_alloc_peertab(); +} + +TEST_F(homa_peer, homa_peer_prefer_evict) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer1, *peer2, *peer3, *peer4; + struct homa_net *hnet2; + struct homa_sock hsk2; + + hnet2 = mock_hnet(1, &self->homa); + mock_sock_init(&hsk2, hnet2, 44); + + peer1 = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer1); + peer1->access_jiffies = 100; + + peer2 = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peer2); + peer2->access_jiffies = 1000; + + peer3 = homa_peer_get(&hsk2, ip3333); + homa_peer_release(peer3); + peer3->access_jiffies = 500; + + peer4 = homa_peer_get(&hsk2, ip1111); + homa_peer_release(peer4); + peer4->access_jiffies = 300; + hnet2->num_peers = peertab->net_max + 1; + + EXPECT_EQ(1, homa_peer_prefer_evict(peertab, peer3, peer1)); + EXPECT_EQ(0, homa_peer_prefer_evict(peertab, peer3, peer4)); + EXPECT_EQ(0, homa_peer_prefer_evict(peertab, peer1, peer4)); + EXPECT_EQ(1, homa_peer_prefer_evict(peertab, peer1, peer2)); + + unit_sock_destroy(&hsk2); + homa_peer_free_net(hnet2); +} + +TEST_F(homa_peer, homa_peer_pick_victims__hash_table_wraparound) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[3], *victims[5]; + + jiffies = 50; + peers[0] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[0]); + + peers[1] = NULL; + + peers[2] = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peers[2]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 3; + jiffies = peertab->idle_jiffies_max + 100; + + EXPECT_EQ(2, homa_peer_pick_victims(peertab, victims, 5)); + EXPECT_EQ(peers[0], victims[0]); + EXPECT_EQ(peers[2], victims[1]); +} +TEST_F(homa_peer, homa_peer_pick_victims__EAGAIN_from_rht_walk) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[5], *victims[5]; + + jiffies = 50; + peers[0] = ERR_PTR(-EAGAIN); + + peers[1] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[1]); + + peers[2] = ERR_PTR(-EAGAIN); + + peers[3] = ERR_PTR(-EAGAIN); + + peers[4] = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peers[4]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 5; + jiffies = peertab->idle_jiffies_max + 100; + + EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 5)); + EXPECT_EQ(peers[1], victims[0]); +} +TEST_F(homa_peer, homa_peer_pick_victims__filter_idle_jiffies_min) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[2], *victims[5]; + + jiffies = 100; + peers[1] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[1]); + + jiffies = 200; + peers[0] = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peers[0]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 2; + jiffies = peertab->idle_jiffies_min + 150; + self->hnet->num_peers = peertab->net_max + 1000; + + EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 5)); + EXPECT_EQ(peers[1], victims[0]); +} +TEST_F(homa_peer, homa_peer_pick_victims__filter_idle_jiffies_max) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[3], *victims[5]; + struct 
homa_net *hnet2; + struct homa_sock hsk2; + + hnet2 = mock_hnet(1, &self->homa); + mock_sock_init(&hsk2, hnet2, 44); + hnet2->num_peers = peertab->net_max + 1; + + /* First peer: net below limit, idle < max. */ + jiffies = 150; + peers[0] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[0]); + + /* Second peer: net above limit, idle > max. */ + jiffies = 50; + peers[1] = homa_peer_get(&hsk2, ip2222); + homa_peer_release(peers[1]); + + /* Third peer: net below limit, idle > max. */ + jiffies = 50; + peers[2] = homa_peer_get(&self->hsk, ip3333); + homa_peer_release(peers[2]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 3; + jiffies = peertab->idle_jiffies_max + 100; + + EXPECT_EQ(2, homa_peer_pick_victims(peertab, victims, 5)); + EXPECT_EQ(peers[1], victims[0]); + EXPECT_EQ(peers[2], victims[1]); +} +TEST_F(homa_peer, homa_peer_pick_victims__duplicate_peer) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[3], *victims[3]; + + jiffies = 300; + peers[0] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[0]); + + peers[1] = peers[0]; + peers[2] = peers[0]; + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 3; + jiffies = peertab->idle_jiffies_max + 1000; + + EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 3)); + EXPECT_EQ(peers[0], victims[0]); +} +TEST_F(homa_peer, homa_peer_pick_victims__select_best_candidates) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[6], *victims[3]; + + jiffies = 300; + peers[0] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[0]); + + jiffies = 400; + peers[1] = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peers[1]); + + jiffies = 500; + peers[2] = homa_peer_get(&self->hsk, ip3333); + homa_peer_release(peers[2]); + + jiffies = 200; + peers[3] = homa_peer_get(&self->hsk, ip4444); + homa_peer_release(peers[3]); + + jiffies = 350; + peers[4] = homa_peer_get(&self->hsk, ip5555); + homa_peer_release(peers[4]); + + jiffies = 600; + peers[5] = homa_peer_get(&self->hsk, ip6666); + homa_peer_release(peers[5]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 6; + jiffies = peertab->idle_jiffies_max + 1000; + + EXPECT_EQ(3, homa_peer_pick_victims(peertab, victims, 3)); + EXPECT_EQ(peers[3], victims[0]); + EXPECT_EQ(peers[0], victims[1]); + EXPECT_EQ(peers[4], victims[2]); +} + +TEST_F(homa_peer, homa_peer_gc__basics) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + jiffies = 300; + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); + EXPECT_EQ(1, self->hnet->num_peers); + + jiffies = peertab->idle_jiffies_max + 1000; + peertab->num_peers = peertab->gc_threshold; + + unit_log_clear(); + homa_peer_gc(peertab); + EXPECT_STREQ("call_rcu invoked", unit_log_get()); + EXPECT_EQ(0, self->hnet->num_peers); + EXPECT_EQ(peertab->gc_threshold - 1, peertab->num_peers); +} +TEST_F(homa_peer, homa_peer_gc__gc_stop_count) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + jiffies = 300; + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); + EXPECT_EQ(1, self->hnet->num_peers); + + jiffies = peertab->idle_jiffies_max + 1000; + peertab->num_peers = peertab->gc_threshold; + peertab->gc_stop_count = 1; + + unit_log_clear(); + homa_peer_gc(peertab); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(1, self->hnet->num_peers); +} +TEST_F(homa_peer, homa_peer_gc__peers_below_gc_threshold) +{ + 
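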
struct homa_peertab *peertab = self->homa.peertab;
+	struct homa_peer *peer;
+
+	jiffies = 300;
+	peer = homa_peer_get(&self->hsk, ip1111);
+	homa_peer_release(peer);
+
+	jiffies = peertab->idle_jiffies_max + 1000;
+	peertab->num_peers = peertab->gc_threshold - 1;
+
+	unit_log_clear();
+	homa_peer_gc(peertab);
+	EXPECT_STREQ("", unit_log_get());
+}
+TEST_F(homa_peer, homa_peer_gc__no_suitable_candidates)
+{
+	struct homa_peertab *peertab = self->homa.peertab;
+	struct homa_peer *peer;
+
+	jiffies = 100;
+	peer = homa_peer_get(&self->hsk, ip1111);
+	homa_peer_release(peer);
+
+	jiffies = peertab->idle_jiffies_min;
+	peertab->num_peers = peertab->gc_threshold;
+
+	unit_log_clear();
+	homa_peer_gc(peertab);
+	EXPECT_STREQ("", unit_log_get());
+}
+
+TEST_F(homa_peer, homa_peer_alloc__success)
+{
+	struct homa_peer *peer;
+
+	jiffies = 999;
+	peer = homa_peer_alloc(&self->hsk, ip1111);
+	ASSERT_FALSE(IS_ERR(peer));
+	EXPECT_EQ_IP(*ip1111, peer->addr);
+	EXPECT_EQ(999, peer->access_jiffies);
+#ifndef __STRIP__ /* See strip.py */
+	EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]);
+	EXPECT_EQ(0, peer->cutoff_version);
+	EXPECT_EQ(1, homa_metrics_per_cpu()->peer_allocs);
+#endif /* See strip.py */
+	EXPECT_EQ(1, atomic_read(&peer->dst->__rcuref.refcnt));
+	homa_peer_release(peer);
+}
+TEST_F(homa_peer, homa_peer_alloc__kmalloc_error)
+{
+	struct homa_peer *peer;
+
+	mock_kmalloc_errors = 1;
+	peer = homa_peer_alloc(&self->hsk, ip3333);
+	EXPECT_EQ(ENOMEM, -PTR_ERR(peer));
+
+#ifndef __STRIP__ /* See strip.py */
+	EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors);
+#endif /* See strip.py */
+}
+TEST_F(homa_peer, homa_peer_alloc__route_error)
+{
+	struct homa_peer *peer;
+
+	mock_route_errors = 1;
+	peer = homa_peer_alloc(&self->hsk, ip3333);
+	EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer));
+
+#ifndef __STRIP__ /* See strip.py */
+	EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->peer_allocs);
+#endif /* See strip.py */
+}
+
+TEST_F(homa_peer, homa_peer_free)
+{
+	struct homa_peer *peer;
+	struct dst_entry *dst;
+
+	peer = homa_peer_alloc(&self->hsk, ip1111);
+	ASSERT_FALSE(IS_ERR(peer));
+	dst = peer->dst;
+	dst_hold(dst);
+	ASSERT_EQ(2, atomic_read(&dst->__rcuref.refcnt));
+
+	homa_peer_release(peer);
+	ASSERT_EQ(1, atomic_read(&dst->__rcuref.refcnt));
+	dst_release(dst);
+}
+
+TEST_F(homa_peer, homa_peer_get__basics)
+{
+	struct homa_peer *peer, *peer2;
+
+	/* First call: create new peer. */
+	jiffies = 456;
+	peer = homa_peer_get(&self->hsk, ip1111);
+	ASSERT_FALSE(IS_ERR(peer));
+	EXPECT_EQ_IP(*ip1111, peer->addr);
+	EXPECT_EQ(456, peer->access_jiffies);
+	EXPECT_EQ(2, refcount_read(&peer->refs));
+#ifndef __STRIP__ /* See strip.py */
+	EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]);
+	EXPECT_EQ(0, peer->cutoff_version);
+#endif /* See strip.py */
+	EXPECT_EQ(1, self->homa.peertab->num_peers);
+	EXPECT_EQ(1, self->hnet->num_peers);
+
+	/* Second call: lookup existing peer. */
+	peer2 = homa_peer_get(&self->hsk, ip1111);
+	EXPECT_EQ(peer, peer2);
+	EXPECT_EQ(3, refcount_read(&peer->refs));
+	EXPECT_EQ(1, self->homa.peertab->num_peers);
+	EXPECT_EQ(1, self->hnet->num_peers);
+
+	/* Third call: lookup new peer.
*/ + peer2 = homa_peer_get(&self->hsk, ip2222); + EXPECT_NE(peer, peer2); + ASSERT_FALSE(IS_ERR(peer2)); + EXPECT_EQ(2, refcount_read(&peer2->refs)); + EXPECT_EQ(2, self->homa.peertab->num_peers); + EXPECT_EQ(2, self->hnet->num_peers); + +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(2, homa_metrics_per_cpu()->peer_allocs); +#endif /* See strip.py */ + homa_peer_release(peer); + homa_peer_release(peer); + homa_peer_release(peer2); +} +TEST_F(homa_peer, homa_peer_get__error_in_homa_peer_alloc) +{ + struct homa_peer *peer; + + mock_route_errors = 1; + peer = homa_peer_get(&self->hsk, ip3333); + EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); + +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); +#endif /* See strip.py */ +} +TEST_F(homa_peer, homa_peer_get__insert_error) +{ + struct homa_peer *peer; + + mock_rht_insert_errors = 1; + peer = homa_peer_get(&self->hsk, ip3333); + EXPECT_TRUE(IS_ERR(peer)); + EXPECT_EQ(EINVAL, -PTR_ERR(peer)); +} +TEST_F(homa_peer, homa_peer_get__conflicting_create) +{ + struct homa_peer *peer; + + test_data = self; + peer_race_hook_invocations = 0; + unit_hook_register(peer_race_hook); + jiffies = 100; + peer = homa_peer_get(&self->hsk, ip3333); + EXPECT_FALSE(IS_ERR(conflicting_peer)); + EXPECT_EQ(conflicting_peer, peer); + EXPECT_EQ(2, refcount_read(&peer->refs)); + EXPECT_EQ(110, peer->access_jiffies); + homa_peer_release(peer); + EXPECT_EQ(1, self->homa.peertab->num_peers); + EXPECT_EQ(1, self->hnet->num_peers); +} + +TEST_F(homa_peer, homa_get_dst__normal) +{ + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + struct dst_entry *dst; + + dst = homa_get_dst(peer, &self->hsk); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->peer_dst_refreshes)); + dst_release(dst); + homa_peer_release(peer); +} +TEST_F(homa_peer, homa_get_dst__must_refresh_obsolete) +{ + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + struct dst_entry *old, *dst; + + old = peer->dst; + peer->dst->obsolete = 1; + mock_dst_check_errors = 1; + dst = homa_get_dst(peer, &self->hsk); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes)); + EXPECT_NE(old, dst); + dst_release(dst); + homa_peer_release(peer); +} +TEST_F(homa_peer, homa_get_dst__multiple_refresh_failures) +{ + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + struct dst_entry *old, *dst; + + old = peer->dst; + peer->dst->obsolete = 1; + mock_dst_check_errors = 0xf; + mock_route_errors = 0xf; + dst = homa_get_dst(peer, &self->hsk); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes)); + EXPECT_EQ(old, dst); + EXPECT_EQ(3, mock_dst_check_errors); + dst_release(dst); + homa_peer_release(peer); +} + +TEST_F(homa_peer, homa_peer_reset_dst__ipv4) +{ + int status; + + // Make sure the test uses IPv4. + mock_ipv6 = false; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + + struct homa_peer *peer = homa_peer_get(&self->hsk, + &self->client_ip[0]); + ASSERT_NE(NULL, peer); + + status = homa_peer_reset_dst(peer, &self->hsk); + ASSERT_EQ(0, -status); + ASSERT_NE(NULL, peer->dst); + EXPECT_STREQ("196.168.0.1", + homa_print_ipv4_addr(peer->flow.u.ip4.daddr)); + homa_peer_release(peer); +} +TEST_F(homa_peer, homa_peer_reset_dst__ipv4_route_error) +{ + struct dst_entry *old; + int status; + + // Make sure the test uses IPv4. 
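+	// (The socket is destroyed and re-created below because the mock
+	// presumably latches the address family at socket-init time, so
+	// changing mock_ipv6 alone would not affect the existing socket.)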
+	mock_ipv6 = false;
+	unit_sock_destroy(&self->hsk);
+	mock_sock_init(&self->hsk, self->hnet, 0);
+
+	struct homa_peer *peer = homa_peer_get(&self->hsk,
+					       &self->client_ip[0]);
+	ASSERT_NE(NULL, peer);
+	old = peer->dst;
+
+	mock_route_errors = 1;
+	status = homa_peer_reset_dst(peer, &self->hsk);
+	EXPECT_EQ(EHOSTUNREACH, -status);
+	EXPECT_EQ(old, peer->dst);
+	homa_peer_release(peer);
+}
+TEST_F(homa_peer, homa_peer_reset_dst__ipv6)
+{
+	int status;
+
+	// Make sure the test uses IPv6.
+	mock_ipv6 = true;
+	unit_sock_destroy(&self->hsk);
+	mock_sock_init(&self->hsk, self->hnet, 0);
+
+	struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]);
+	ASSERT_NE(NULL, peer);
+
+	status = homa_peer_reset_dst(peer, &self->hsk);
+	ASSERT_EQ(0, -status);
+	EXPECT_STREQ("[1::1:1:1]",
+		     homa_print_ipv6_addr(&peer->flow.u.ip6.daddr));
+	homa_peer_release(peer);
+}
+TEST_F(homa_peer, homa_peer_reset_dst__ipv6_route_error)
+{
+	struct dst_entry *old;
+	int status;
+
+	// Make sure the test uses IPv6.
+	mock_ipv6 = true;
+	unit_sock_destroy(&self->hsk);
+	mock_sock_init(&self->hsk, self->hnet, 0);
+
+	struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]);
+	ASSERT_NE(NULL, peer);
+	old = peer->dst;
+
+	mock_route_errors = 1;
+	status = homa_peer_reset_dst(peer, &self->hsk);
+	EXPECT_EQ(EHOSTUNREACH, -status);
+	EXPECT_EQ(old, peer->dst);
+	homa_peer_release(peer);
+}
+
+#ifndef __STRIP__ /* See strip.py */
+TEST_F(homa_peer, homa_unsched_priority)
+{
+	struct homa_peer peer;
+
+	homa_peer_set_cutoffs(&peer, INT_MAX, 0, 0, INT_MAX, 200, 100, 0, 0);
+
+	EXPECT_EQ(5, homa_unsched_priority(&self->homa, &peer, 10));
+	EXPECT_EQ(4, homa_unsched_priority(&self->homa, &peer, 200));
+	EXPECT_EQ(3, homa_unsched_priority(&self->homa, &peer, 201));
+}
+
+TEST_F(homa_peer, homa_peer_lock_slow)
+{
+	struct homa_peer *peer = homa_peer_get(&self->hsk, ip3333);
+
+	ASSERT_NE(NULL, peer);
+	mock_clock = 10000;
+	homa_peer_lock(peer);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_misses);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles);
+	homa_peer_unlock(peer);
+
+	mock_trylock_errors = 1;
+	unit_hook_register(peer_spinlock_hook);
+	homa_peer_lock(peer);
+	EXPECT_EQ(1, homa_metrics_per_cpu()->peer_ack_lock_misses);
+	EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles);
+	homa_peer_unlock(peer);
+	homa_peer_release(peer);
+}
+#endif /* See strip.py */
+
+TEST_F(homa_peer, homa_peer_add_ack)
+{
+	struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			101, 100, 100);
+	struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			102, 100, 100);
+	struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			103, 100, 100);
+	struct homa_peer *peer = crpc1->peer;
+
+	EXPECT_EQ(0, peer->num_acks);
+
+	/* Initialize 3 acks in the peer.
*/ + peer->acks[0] = (struct homa_ack) { + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(90)}; + peer->acks[1] = (struct homa_ack) { + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(91)}; + peer->acks[2] = (struct homa_ack) { + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(92)}; + peer->num_acks = 3; + + /* Add one RPC to unacked (fits). */ + homa_peer_add_ack(crpc1); + EXPECT_EQ(4, peer->num_acks); + EXPECT_STREQ("server_port 99, client_id 101", + unit_ack_string(&peer->acks[3])); + + /* Add another RPC to unacked (also fits). */ + homa_peer_add_ack(crpc2); + EXPECT_EQ(5, peer->num_acks); + EXPECT_STREQ("server_port 99, client_id 102", + unit_ack_string(&peer->acks[4])); + + /* Third RPC overflows, triggers ACK transmission. */ + unit_log_clear(); + mock_xmit_log_verbose = 1; + homa_peer_add_ack(crpc3); + EXPECT_EQ(0, peer->num_acks); + EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 103, acks [sp 99, id 90] [sp 99, id 91] [sp 99, id 92] [sp 99, id 101] [sp 99, id 102]", + unit_log_get()); +} + +TEST_F(homa_peer, homa_peer_get_acks) +{ + struct homa_peer *peer = homa_peer_get(&self->hsk, ip3333); + struct homa_ack acks[2]; + + ASSERT_NE(NULL, peer); + EXPECT_EQ(0, peer->num_acks); + + // First call: nothing available. + EXPECT_EQ(0, homa_peer_get_acks(peer, 2, acks)); + + // Second call: retrieve 2 out of 3. + peer->acks[0] = (struct homa_ack) { + .server_port = htons(5000), + .client_id = cpu_to_be64(100)}; + peer->acks[1] = (struct homa_ack) { + .server_port = htons(5001), + .client_id = cpu_to_be64(101)}; + peer->acks[2] = (struct homa_ack) { + .server_port = htons(5002), + .client_id = cpu_to_be64(102)}; + peer->num_acks = 3; + EXPECT_EQ(2, homa_peer_get_acks(peer, 2, acks)); + EXPECT_STREQ("server_port 5001, client_id 101", + unit_ack_string(&acks[0])); + EXPECT_STREQ("server_port 5002, client_id 102", + unit_ack_string(&acks[1])); + EXPECT_EQ(1, peer->num_acks); + + // Third call: retrieve final id. + EXPECT_EQ(1, homa_peer_get_acks(peer, 2, acks)); + EXPECT_STREQ("server_port 5000, client_id 100", + unit_ack_string(&acks[0])); + homa_peer_release(peer); +} + +TEST_F(homa_peer, homa_peer_update_sysctl_deps) +{ + struct homa_peertab *peertab = self->homa.peertab; + + peertab->idle_secs_min = 10; + peertab->idle_secs_max = 100; + homa_peer_update_sysctl_deps(peertab); + EXPECT_EQ(10*HZ, peertab->idle_jiffies_min); + EXPECT_EQ(100*HZ, peertab->idle_jiffies_max); +} diff --git a/test/unit_homa_peertab.c b/test/unit_homa_peertab.c deleted file mode 100644 index d95c4597..00000000 --- a/test/unit_homa_peertab.c +++ /dev/null @@ -1,385 +0,0 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#include "homa_impl.h" -#define KSELFTEST_NOT_MAIN 1 -#include "kselftest_harness.h" -#include "ccutils.h" -#include "mock.h" -#include "utils.h" - -struct in6_addr ip1111[1]; -struct in6_addr ip2222[1]; -struct in6_addr ip3333[1]; - -FIXTURE(homa_peertab) { - struct homa homa; - struct homa_sock hsk; - struct homa_peertab peertab; - struct in6_addr client_ip[1]; - struct in6_addr server_ip[1]; - int server_port; -}; -FIXTURE_SETUP(homa_peertab) -{ - homa_init(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); - homa_peertab_init(&self->peertab); - self->client_ip[0] = unit_get_in_addr("196.168.0.1"); - self->server_ip[0] = unit_get_in_addr("1.2.3.4"); - ip1111[0] = unit_get_in_addr("1::1:1:1"); - ip2222[0] = unit_get_in_addr("2::2:2:2"); - ip3333[0] = unit_get_in_addr("3::3:3:3"); - self->server_port = 99; -} -FIXTURE_TEARDOWN(homa_peertab) -{ - homa_peertab_destroy(&self->peertab); - homa_destroy(&self->homa); - unit_teardown(); -} - -static int dead_count(struct homa_peertab *peertab) -{ - struct list_head *pos; - int count = 0; - list_for_each(pos, &peertab->dead_dsts) - count++; - return count; -} - -static void peer_spinlock_hook(char *id) -{ - if (strcmp(id, "spin_lock") != 0) - return; - mock_cycles += 1000; -} - -TEST_F(homa_peertab, homa_peer_find__basics) -{ - struct homa_peer *peer, *peer2; - - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); - EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]); - EXPECT_EQ(0, peer->cutoff_version); - - peer2 = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - EXPECT_EQ(peer, peer2); - - peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); - EXPECT_NE(peer, peer2); - - EXPECT_EQ(2, homa_cores[cpu_number]->metrics.peer_new_entries); -} - -static struct _test_data_homa_peertab *test_data; -static struct homa_peer *conflicting_peer = NULL; -static int peer_lock_hook_invocations = 0; -static void peer_lock_hook(char *id) { - if (strcmp(id, "spin_lock") != 0) - return; - if (peer_lock_hook_invocations > 0) - return; - peer_lock_hook_invocations ++; - /* Creates a peer with the same address as the one being created - * by the main test function below. */ - conflicting_peer = homa_peer_find(&test_data->peertab, ip3333, - &test_data->hsk.inet); -} - -TEST_F(homa_peertab, homa_peertab_init__vmalloc_failed) -{ - struct homa_peertab table; - mock_vmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_peertab_init(&table)); - - /* Make sure destroy is safe after failed init. 
*/ - homa_peertab_destroy(&table); -} - -TEST_F(homa_peertab, homa_peertab_gc_dsts) -{ - struct homa_peer *peer; - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - mock_cycles = 0; - homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_cycles = 50000000; - homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_cycles = 100000000; - homa_dst_refresh(&self->peertab, peer, &self->hsk); - EXPECT_EQ(3, dead_count(&self->peertab)); - - homa_peertab_gc_dsts(&self->peertab, 150000000); - EXPECT_EQ(2, dead_count(&self->peertab)); - homa_peertab_gc_dsts(&self->peertab, ~0); - EXPECT_EQ(0, dead_count(&self->peertab)); -} - -TEST_F(homa_peertab, homa_peer_find__conflicting_creates) -{ - struct homa_peer *peer; - - test_data = self; - peer_lock_hook_invocations = 0; - unit_hook_register(peer_lock_hook); - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - EXPECT_NE(NULL, conflicting_peer); - EXPECT_EQ(conflicting_peer, peer); -} -TEST_F(homa_peertab, homa_peer_find__kmalloc_error) -{ - struct homa_peer *peer; - - mock_kmalloc_errors = 1; - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - EXPECT_EQ(ENOMEM, -PTR_ERR(peer)); - - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_kmalloc_errors); -} -TEST_F(homa_peertab, homa_peer_find__route_error) -{ - struct homa_peer *peer; - - mock_route_errors = 1; - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); - - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_route_errors); -} - -TEST_F(homa_peertab, homa_dst_refresh__basics) -{ - struct homa_peer *peer; - struct dst_entry *old_dst; - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); - - old_dst = homa_get_dst(peer, &self->hsk); - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - EXPECT_NE(old_dst, peer->dst); - EXPECT_EQ(1, dead_count(&self->homa.peers)); -} -TEST_F(homa_peertab, homa_dst_refresh__routing_error) -{ - struct homa_peer *peer; - struct dst_entry *old_dst; - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); - - old_dst = homa_get_dst(peer, &self->hsk); - mock_route_errors = 1; - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - EXPECT_EQ(old_dst, peer->dst); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_route_errors); - EXPECT_EQ(0, dead_count(&self->homa.peers)); -} -TEST_F(homa_peertab, homa_dst_refresh__malloc_error) -{ - struct homa_peer *peer; - struct dst_entry *old_dst; - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); - - old_dst = homa_get_dst(peer, &self->hsk); - mock_kmalloc_errors = 1; - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - EXPECT_NE(old_dst, peer->dst); - EXPECT_EQ(0, dead_count(&self->homa.peers)); -} -TEST_F(homa_peertab, homa_dst_refresh__free_old_dsts) -{ - struct homa_peer *peer; - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); - - mock_cycles = 0; - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - EXPECT_EQ(2, dead_count(&self->homa.peers)); - mock_cycles = 500000000; - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - EXPECT_EQ(1, dead_count(&self->homa.peers)); -} - -TEST_F(homa_peertab, homa_unsched_priority) -{ - struct homa_peer peer; - 
homa_peer_set_cutoffs(&peer, INT_MAX, 0, 0, INT_MAX, 200, 100, 0, 0); - - EXPECT_EQ(5, homa_unsched_priority(&self->homa, &peer, 10)); - EXPECT_EQ(4, homa_unsched_priority(&self->homa, &peer, 200)); - EXPECT_EQ(3, homa_unsched_priority(&self->homa, &peer, 201)); -} - -TEST_F(homa_peertab, homa_peer_get_dst_ipv4) -{ - struct dst_entry *dst; - - // Make sure the test uses IPv4. - mock_ipv6 = false; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); - - struct homa_peer *peer = homa_peer_find(&self->peertab, - &self->client_ip[0], &self->hsk.inet); - ASSERT_NE(NULL, peer); - - dst = homa_peer_get_dst(peer, &self->hsk.inet); - ASSERT_NE(NULL, dst); - dst_release(dst); - EXPECT_STREQ("196.168.0.1", - homa_print_ipv4_addr(peer->flow.u.ip4.daddr)); -} -TEST_F(homa_peertab, homa_peer_get_dst_ipv6) -{ - char buffer[30]; - __u32 addr; - struct dst_entry *dst; - - // Make sure the test uses IPv6. - mock_ipv6 = true; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); - - struct homa_peer *peer = homa_peer_find(&self->peertab, &ip1111[0], - &self->hsk.inet); - ASSERT_NE(NULL, peer); - - dst = homa_peer_get_dst(peer, &self->hsk.inet); - ASSERT_NE(NULL, dst); - dst_release(dst); - addr = ntohl(peer->flow.u.ip4.daddr); - snprintf(buffer, sizeof(buffer), "%u.%u.%u.%u", (addr >> 24) & 0xff, - (addr >> 16) & 0xff, (addr >> 8) & 0xff, addr & 0xff); - EXPECT_STREQ("[1::1:1:1]", - homa_print_ipv6_addr(&peer->flow.u.ip6.daddr)); -} - -TEST_F(homa_peertab, homa_peer_lock_slow) -{ - mock_cycles = 10000; - struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, - &self->hsk.inet); - ASSERT_NE(NULL, peer); - - homa_peer_lock(peer); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.peer_ack_lock_misses); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.peer_ack_lock_miss_cycles); - homa_peer_unlock(peer); - - mock_trylock_errors = 1; - unit_hook_register(peer_spinlock_hook); - homa_peer_lock(peer); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_ack_lock_misses); - EXPECT_EQ(1000, homa_cores[cpu_number]->metrics.peer_ack_lock_miss_cycles); - homa_peer_unlock(peer); -} - -TEST_F(homa_peertab, homa_peer_add_ack) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 101, 100, 100); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 102, 100, 100); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 103, 100, 100); - struct homa_peer *peer = crpc1->peer; - EXPECT_EQ(0, peer->num_acks); - - /* Initialize 3 acks in the peer. */ - peer->acks[0] = (struct homa_ack) { - .client_port = htons(1000), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(90)}; - peer->acks[1] = (struct homa_ack) { - .client_port = htons(1001), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(91)}; - peer->acks[2] = (struct homa_ack) { - .client_port = htons(1002), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(92)}; - peer->num_acks = 3; - - /* Add one RPC to unacked (fits). */ - homa_peer_add_ack(crpc1); - EXPECT_EQ(4, peer->num_acks); - EXPECT_STREQ("client_port 32768, server_port 99, client_id 101", - unit_ack_string(&peer->acks[3])); - - /* Add another RPC to unacked (also fits). 
*/ - homa_peer_add_ack(crpc2); - EXPECT_EQ(5, peer->num_acks); - EXPECT_STREQ("client_port 32768, server_port 99, client_id 102", - unit_ack_string(&peer->acks[4])); - - /* Third RPC overflows, triggers ACK transmission. */ - unit_log_clear(); - mock_xmit_log_verbose = 1; - homa_peer_add_ack(crpc3); - EXPECT_EQ(0, peer->num_acks); - EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 103, acks " - "[cp 1000, sp 99, id 90] [cp 1001, sp 99, id 91] " - "[cp 1002, sp 99, id 92] [cp 32768, sp 99, id 101] " - "[cp 32768, sp 99, id 102]", - unit_log_get()); -} - -TEST_F(homa_peertab, homa_peer_get_acks) -{ - struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, - &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ(0, peer->num_acks); - - // First call: nothing available. - struct homa_ack acks[2]; - EXPECT_EQ(0, homa_peer_get_acks(peer, 2, acks)); - - // Second call: retrieve 2 out of 3. - peer->acks[0] = (struct homa_ack) { - .client_port = htons(4000), - .server_port = htons(5000), - .client_id = cpu_to_be64(100)}; - peer->acks[1] = (struct homa_ack) { - .client_port = htons(4001), - .server_port = htons(5001), - .client_id = cpu_to_be64(101)}; - peer->acks[2] = (struct homa_ack) { - .client_port = htons(4002), - .server_port = htons(5002), - .client_id = cpu_to_be64(102)}; - peer->num_acks = 3; - EXPECT_EQ(2, homa_peer_get_acks(peer, 2, acks)); - EXPECT_STREQ("client_port 4001, server_port 5001, client_id 101", - unit_ack_string(&acks[0])); - EXPECT_STREQ("client_port 4002, server_port 5002, client_id 102", - unit_ack_string(&acks[1])); - EXPECT_EQ(1, peer->num_acks); - - // Third call: retrieve final id. - EXPECT_EQ(1, homa_peer_get_acks(peer, 2, acks)); - EXPECT_STREQ("client_port 4000, server_port 5000, client_id 100", - unit_ack_string(&acks[0])); -} diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 7f7f20af..8c1a292a 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -1,39 +1,39 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" +#include "homa_peer.h" +#include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" #include "mock.h" #include "utils.h" -extern struct homa *homa; +/* The following hook function frees hook_rpc. 
*/
+static struct homa_rpc *hook_rpc;
+static void unlock_hook(char *id)
+{
+	if (strcmp(id, "unlock") != 0)
+		return;
+	if (hook_rpc) {
+		homa_rpc_end(hook_rpc);
+		hook_rpc = NULL;
+	}
+}
 FIXTURE(homa_plumbing) {
 	struct in6_addr client_ip[1];
 	int client_port;
 	struct in6_addr server_ip[1];
 	int server_port;
-	__u64 client_id;
-	__u64 server_id;
+	u64 client_id;
+	u64 server_id;
 	struct homa homa;
+	struct homa_net *hnet;
 	struct homa_sock hsk;
-	sockaddr_in_union client_addr;
-	sockaddr_in_union server_addr;
-	struct data_header data;
+	union sockaddr_in_union client_addr;
+	union sockaddr_in_union server_addr;
+	struct homa_data_hdr data;
 	int starting_skb_count;
 	struct msghdr recvmsg_hdr;
 	struct homa_recvmsg_args recvmsg_args;
@@ -42,7 +42,7 @@ FIXTURE(homa_plumbing) {
 	struct homa_sendmsg_args sendmsg_args;
 	char buffer[2000];
 	sockptr_t optval;
-	sockaddr_in_union addr;
+	union sockaddr_in_union addr;
 };
 FIXTURE_SETUP(homa_plumbing)
 {
@@ -56,9 +56,11 @@ FIXTURE_SETUP(homa_plumbing)
 	self->client_addr.in6.sin6_port = htons(self->client_port);
 	self->server_addr.in6.sin6_addr = self->server_ip[0];
 	self->server_addr.in6.sin6_port = htons(self->server_port);
-	homa = &self->homa;
 	homa_init(&self->homa);
-	mock_sock_init(&self->hsk, &self->homa, 0);
+	self->hnet = mock_hnet(0, &self->homa);
+	mock_sock_init(&self->hsk, self->hnet, 0);
 	self->client_addr.in6.sin6_family = self->hsk.inet.sk.sk_family;
 	self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family;
 	if (self->hsk.inet.sk.sk_family == AF_INET) {
@@ -67,24 +69,25 @@ FIXTURE_SETUP(homa_plumbing)
 		self->server_addr.in4.sin_addr.s_addr =
 				ipv6_to_ipv4(self->server_addr.in6.sin6_addr);
 	}
-	homa_sock_bind(&self->homa.port_map, &self->hsk, self->server_port);
-	self->data = (struct data_header){.common = {
-			.sport = htons(self->client_port),
-			.dport = htons(self->server_port),
-			.type = DATA,
-			.sender_id = cpu_to_be64(self->client_id)},
-			.message_length = htonl(10000),
-			.incoming = htonl(10000), .retransmit = 0,
-			.seg={.offset = 0}};
+	homa_sock_bind(self->hnet, &self->hsk, self->server_port);
+	memset(&self->data, 0, sizeof(self->data));
+	self->data = (struct homa_data_hdr){.common = {
+			.sport = htons(self->client_port),
+			.dport = htons(self->server_port),
+			.type = DATA,
+			.sender_id = cpu_to_be64(self->client_id)},
+			.message_length = htonl(10000),
+#ifndef __STRIP__ /* See strip.py */
+			.incoming = htonl(10000),
+#endif /* See strip.py */
+	};
 	self->recvmsg_args.id = 0;
 	self->recvmsg_hdr.msg_name = &self->addr;
 	self->recvmsg_hdr.msg_namelen = 0;
 	self->recvmsg_hdr.msg_control = &self->recvmsg_args;
 	self->recvmsg_hdr.msg_controllen = sizeof(self->recvmsg_args);
-	self->recvmsg_hdr.msg_flags = 0;
+	self->recvmsg_hdr.msg_flags = MSG_DONTWAIT;
 	memset(&self->recvmsg_args, 0, sizeof(self->recvmsg_args));
-	self->recvmsg_args.flags = HOMA_RECVMSG_REQUEST
-			| HOMA_RECVMSG_RESPONSE | HOMA_RECVMSG_NONBLOCKING;
 	self->send_vec[0].iov_base = self->buffer;
 	self->send_vec[0].iov_len = 100;
 	self->send_vec[1].iov_base = self->buffer + 1000;
@@ -101,119 +104,184 @@ FIXTURE_SETUP(homa_plumbing)
 	self->optval.user = (void *) 0x100000;
 	self->optval.is_kernel = 0;
 	unit_log_clear();
 }
 FIXTURE_TEARDOWN(homa_plumbing)
 {
 	homa_destroy(&self->homa);
 	unit_teardown();
-	homa = NULL;
+}
+
+TEST_F(homa_plumbing, homa_load__error_in_inet6_register_protosw)
+{
+	homa_destroy(&self->homa);
+
+	/* First
attempt fails. */ + mock_register_protosw_errors = 1; + EXPECT_EQ(EINVAL, -homa_load()); + + /* Second attempt succeeds. */ + EXPECT_EQ(0, -homa_load()); + + homa_unload(); +} + +TEST_F(homa_plumbing, homa_net_exit__free_peers) +{ + struct in6_addr addr1 = unit_get_in_addr("1.2.3.4"); + struct in6_addr addr2 = unit_get_in_addr("1.2.3.5"); + struct in6_addr addr3 = unit_get_in_addr("1.2.3.6"); + + homa_peer_release(homa_peer_get(&self->hsk, &addr1)); + homa_peer_release(homa_peer_get(&self->hsk, &addr2)); + homa_peer_release(homa_peer_get(&self->hsk, &addr3)); + + EXPECT_EQ(3, unit_count_peers(&self->homa)); + homa_net_exit(mock_net_for_hnet(self->hsk.hnet)); + EXPECT_EQ(0, unit_count_peers(&self->homa)); } TEST_F(homa_plumbing, homa_bind__version_mismatch) { + struct sockaddr addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); - - struct sockaddr addr = {}; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); addr.sa_family = AF_INET6; - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr, sizeof(addr)); + result = homa_bind(&sock, &addr, sizeof(addr)); EXPECT_EQ(EAFNOSUPPORT, -result); + EXPECT_STREQ("address family in bind address didn't match socket", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_bind__ipv6_address_too_short) { + union sockaddr_in_union addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); - sockaddr_in_union addr = {}; addr.in6.sin6_family = AF_INET6; - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)-1); + result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)-1); EXPECT_EQ(EINVAL, -result); + EXPECT_STREQ("ipv6 address too short", self->hsk.error_msg); } TEST_F(homa_plumbing, homa_bind__ipv6_ok) { + union sockaddr_in_union addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + self->hsk.is_server = false; - sockaddr_in_union addr = {}; addr.in6.sin6_family = AF_INET6; addr.in6.sin6_port = htons(123); - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)); + result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)); EXPECT_EQ(0, -result); EXPECT_EQ(123, self->hsk.port); + EXPECT_EQ(1, self->hsk.is_server); } TEST_F(homa_plumbing, homa_bind__ipv4_address_too_short) { + union sockaddr_in_union addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv4. 
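+	// (Passing sizeof(addr.in4)-1 below is presumably just short enough
+	// to trigger homa_bind's "ipv4 address too short" length check.)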
mock_ipv6 = false; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); - sockaddr_in_union addr = {}; addr.in4.sin_family = AF_INET; - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)-1); + result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)-1); EXPECT_EQ(EINVAL, -result); + EXPECT_STREQ("ipv4 address too short", self->hsk.error_msg); } TEST_F(homa_plumbing, homa_bind__ipv4_ok) { + union sockaddr_in_union addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + self->hsk.is_server = false; - sockaddr_in_union addr = {}; addr.in4.sin_family = AF_INET; addr.in4.sin_port = htons(345); - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)); + result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)); EXPECT_EQ(0, -result); EXPECT_EQ(345, self->hsk.port); + EXPECT_EQ(1, self->hsk.is_server); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_plumbing, homa_ioc_abort__basics) { - struct homa_abort_args args = {self->client_id, 0}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 200); + struct homa_abort_args args = {self->client_id, 0}; + ASSERT_NE(NULL, crpc); - EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (unsigned long) &args)); + EXPECT_EQ(0, homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); EXPECT_EQ(RPC_DEAD, crpc->state); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_ioc_abort__cant_read_user_args) { struct homa_abort_args args = {self->client_id, 0}; + mock_copy_data_errors = 1; - EXPECT_EQ(EFAULT, -homa_ioc_abort(&self->hsk.inet.sk, - (unsigned long) &args)); + EXPECT_EQ(EFAULT, -homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); + EXPECT_STREQ("invalid address for homa_abort_args", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_ioc_abort__nonzero_reserved_fields) +{ + struct homa_abort_args args; + + args._pad1 = 777; + EXPECT_EQ(EINVAL, -homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); + EXPECT_STREQ("reserved fields in homa_abort_args must be zero", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) { - struct homa_abort_args args = {0, ECANCELED}; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 200); struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 200); + struct homa_abort_args args = {0, ECANCELED}; + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); - EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (unsigned long) &args)); + EXPECT_EQ(0, homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); EXPECT_EQ(-ECANCELED, crpc1->error); EXPECT_EQ(-ECANCELED, crpc2->error); EXPECT_EQ(2, unit_list_length(&self->hsk.active_rpcs)); @@ -221,71 +289,473 @@ TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc) { struct homa_abort_args args = {99, 0}; - EXPECT_EQ(EINVAL, 
-homa_ioc_abort(&self->hsk.inet.sk,
-			(unsigned long) &args));
+
+	EXPECT_EQ(EINVAL, -homa_ioc_abort(self->hsk.sock.sk_socket,
+			(unsigned long) &args));
+	EXPECT_STREQ("RPC identifier did not match any existing RPC",
+		     self->hsk.error_msg);
+}
+#endif /* See strip.py */
+
+TEST_F(homa_plumbing, homa_ioc_info__cant_read_homa_info_from_user_space)
+{
+	struct homa_info hinfo;
+
+	mock_copy_data_errors = 1;
+	EXPECT_EQ(EFAULT, -homa_ioc_info(self->hsk.sock.sk_socket,
+			(unsigned long) &hinfo));
+	EXPECT_STREQ("invalid address for homa_info", self->hsk.error_msg);
+}
+TEST_F(homa_plumbing, homa_ioc_info__basics)
+{
+	struct homa_info hinfo;
+
+	memset(&hinfo, 0, sizeof(hinfo));
+	EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket,
+			(unsigned long) &hinfo));
+	EXPECT_EQ(100 * HOMA_BPAGE_SIZE, hinfo.bpool_avail_bytes);
+	EXPECT_EQ(99, hinfo.port);
 }
+TEST_F(homa_plumbing, homa_ioc_info__socket_shutdown)
+{
+	struct homa_info hinfo;
+	struct homa_sock hsk;
+
+	mock_sock_init(&hsk, self->hnet, self->server_port);
+	homa_sock_shutdown(&hsk);
+
+	EXPECT_EQ(ESHUTDOWN, -homa_ioc_info(hsk.sock.sk_socket,
+			(unsigned long) &hinfo));
+	EXPECT_STREQ("socket has been shut down", hsk.error_msg);
+	unit_sock_destroy(&hsk);
+}
+TEST_F(homa_plumbing, homa_ioc_info__rpc_info)
+{
+	struct homa_rpc_info info[10];
+	struct homa_info hinfo;
+
+	unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip,
+			self->server_ip, self->client_port, self->server_id,
+			2000, 100);
+	unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip,
+			self->server_ip, self->client_port, self->server_id + 2,
+			5000, 2000);
+	hinfo.rpc_info = info;
+	hinfo.rpc_info_length = sizeof(info);
+
+	EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket,
+			(unsigned long) &hinfo));
+	EXPECT_EQ(2, hinfo.num_rpcs);
+	EXPECT_EQ(self->server_id, info[0].id);
+	EXPECT_EQ(self->server_id + 2, info[1].id);
+}
+TEST_F(homa_plumbing, homa_ioc_info__ignore_dead_rpc)
+{
+	struct homa_rpc_info info[10];
+	struct homa_info hinfo;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip,
+			self->server_ip, self->client_port, self->server_id,
+			2000, 100);
+	EXPECT_EQ(RPC_IN_SERVICE, srpc->state);
+	srpc->state = RPC_DEAD;
+	unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip,
+			self->server_ip, self->client_port, self->server_id + 2,
+			5000, 2000);
+	hinfo.rpc_info = info;
+	hinfo.rpc_info_length = sizeof(info);
+
+	EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket,
+			(unsigned long) &hinfo));
+	EXPECT_EQ(1, hinfo.num_rpcs);
+	EXPECT_EQ(self->server_id + 2, info[0].id);
+	srpc->state = RPC_IN_SERVICE;
+}
+TEST_F(homa_plumbing, homa_ioc_info__no_memory_for_rpc_info)
+{
+	struct homa_info hinfo;
 
-TEST_F(homa_plumbing, homa_set_sock_opt__bad_level)
+	unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip,
+			self->server_ip, self->client_port, self->server_id,
+			2000, 100);
+	unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip,
+			self->server_ip, self->client_port, self->server_id + 2,
+			5000, 2000);
+	hinfo.rpc_info = NULL;
+	hinfo.rpc_info_length = 1000;
+
+	EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket,
+			(unsigned long) &hinfo));
+	EXPECT_EQ(2, hinfo.num_rpcs);
+}
+TEST_F(homa_plumbing, homa_ioc_info__not_enough_space_for_all_rpcs)
 {
-	EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, 0, 0,
-			self->optval, sizeof(struct homa_set_buf_args)));
+	struct homa_rpc_info info[10];
+	struct homa_info hinfo;
+
+	unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip,
+			
self->server_ip, self->client_port, self->server_id, + 2000, 100); + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id + 2, + 5000, 2000); + memset(info, 0, sizeof(info)); + hinfo.rpc_info = info; + hinfo.rpc_info_length = sizeof(*info); + + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); + EXPECT_EQ(2, hinfo.num_rpcs); + EXPECT_EQ(self->server_id, info[0].id); + EXPECT_EQ(0, info[1].id); } -TEST_F(homa_plumbing, homa_set_sock_opt__bad_optname) +TEST_F(homa_plumbing, homa_ioc_info__cant_copy_rpc_info_to_user) { - EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, 0, - self->optval, sizeof(struct homa_set_buf_args))); + struct homa_rpc_info info[10]; + struct homa_info hinfo; + + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 2000, 100); + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id + 2, + 5000, 2000); + memset(info, 0, sizeof(info)); + hinfo.rpc_info = info; + hinfo.rpc_info_length = sizeof(info); + + mock_copy_to_user_errors = 2; + EXPECT_EQ(EFAULT, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); + EXPECT_STREQ("couldn't copy homa_rpc_info to user space: invalid or read-only address?", + self->hsk.error_msg); + EXPECT_EQ(self->server_id, info[0].id); + EXPECT_EQ(0, info[1].id); } -TEST_F(homa_plumbing, homa_set_sock_opt__bad_optlen) +TEST_F(homa_plumbing, homa_ioc_info__error_msg) +{ + struct homa_info hinfo; + + /* First call: no error message. */ + strcpy(hinfo.error_msg, "Bogus message"); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); + EXPECT_STREQ("", hinfo.error_msg); + + /* Second call: there is a message. */ + self->hsk.error_msg = "Sample error message"; + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); + EXPECT_STREQ("Sample error message", hinfo.error_msg); + + /* Third call: the message is too long. 
*/ + self->hsk.error_msg = "This message is very long; " + "a lot longer than you might think; " + "so long that it exceeds the available space " + "for storing message in struct homa_info"; + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); + EXPECT_EQ(HOMA_ERROR_MSG_SIZE - 1, strlen(hinfo.error_msg)); +} +TEST_F(homa_plumbing, homa_ioc_info__cant_copy_back_to_user_space) +{ + struct homa_info hinfo; + + mock_copy_to_user_errors = 1; + EXPECT_EQ(EFAULT, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); + EXPECT_STREQ("couldn't copy homa_info to user space: read-only address?", + self->hsk.error_msg); +} + +TEST_F(homa_plumbing, homa_ioctl__HOMAIOCINFO) +{ + struct homa_info hinfo; + + hinfo.rpc_info = NULL; + self->hsk.error_msg = "Sample error message"; + EXPECT_EQ(0, -homa_ioctl(self->hsk.sock.sk_socket, HOMAIOCINFO, + (unsigned long) &hinfo)); + EXPECT_STREQ("Sample error message", hinfo.error_msg); +} +TEST_F(homa_plumbing, homa_ioctl__unknown_ioctl_command) +{ + EXPECT_EQ(EINVAL, -homa_ioctl(self->hsk.sock.sk_socket, 47, 0)); + EXPECT_STREQ("ioctl opcode isn't supported by Homa", + self->hsk.error_msg); +} + +TEST_F(homa_plumbing, homa_socket__success) +{ + struct homa_sock hsk; + + memset(&hsk, 0, sizeof(hsk)); + hsk.sock.sk_net.net = mock_net_for_hnet(self->hnet); + refcount_set(&hsk.sock.sk_wmem_alloc, 1); + EXPECT_EQ(0, homa_socket(&hsk.sock)); + unit_sock_destroy(&hsk); +} +TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure) +{ + struct homa_sock hsk; + + memset(&hsk, 0, sizeof(hsk)); + hsk.sock.sk_net.net = mock_net_for_hnet(self->hnet); + refcount_set(&hsk.sock.sk_wmem_alloc, 1); + mock_kmalloc_errors = 1; + EXPECT_EQ(ENOMEM, -homa_socket(&hsk.sock)); +} + +TEST_F(homa_plumbing, homa_setsockopt__bad_level) +{ + EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, 0, 0, + self->optval, sizeof(struct homa_rcvbuf_args))); + EXPECT_STREQ("homa_setsockopt invoked with level not IPPROTO_HOMA", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_setsockopt__recvbuf_bad_optlen) { EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SET_BUF, self->optval, - sizeof(struct homa_set_buf_args) - 1)); + SO_HOMA_RCVBUF, self->optval, + sizeof(struct homa_rcvbuf_args) - 1)); + EXPECT_STREQ("invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)", + self->hsk.error_msg); } -TEST_F(homa_plumbing, homa_set_sock_opt__copy_from_sockptr_fails) +TEST_F(homa_plumbing, homa_setsockopt__recvbuf_copy_from_sockptr_fails) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SET_BUF, self->optval, - sizeof(struct homa_set_buf_args))); + SO_HOMA_RCVBUF, self->optval, + sizeof(struct homa_rcvbuf_args))); + EXPECT_STREQ("invalid address for homa_rcvbuf_args", + self->hsk.error_msg); } -TEST_F(homa_plumbing, homa_set_sock_opt__copy_to_user_fails) +TEST_F(homa_plumbing, homa_setsockopt__recvbuf_region_not_writable) { - struct homa_set_buf_args args = {(void *) 0x100000, 5*HOMA_BPAGE_SIZE}; + struct homa_rcvbuf_args args = {0x100000, 5*HOMA_BPAGE_SIZE}; + self->optval.user = &args; mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SET_BUF, self->optval, - sizeof(struct homa_set_buf_args))); + SO_HOMA_RCVBUF, self->optval, + sizeof(struct homa_rcvbuf_args))); + EXPECT_STREQ("receive buffer region is not writable", + self->hsk.error_msg); } -TEST_F(homa_plumbing, homa_set_sock_opt__success) 
+TEST_F(homa_plumbing, homa_setsockopt__recvbuf_success) { - struct homa_set_buf_args args; + struct homa_rcvbuf_args args; char buffer[5000]; - args.start = (void *) (((__u64) (buffer + PAGE_SIZE - 1)) + args.start = (((uintptr_t)(buffer + PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1)); - args.length = 5*HOMA_BPAGE_SIZE; + args.length = 64*HOMA_BPAGE_SIZE; self->optval.user = &args; + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SET_BUF, self->optval, - sizeof(struct homa_set_buf_args))); - EXPECT_EQ(args.start, self->hsk.buffer_pool.region); - EXPECT_EQ(5, self->hsk.buffer_pool.num_bpages); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.so_set_buf_calls); + SO_HOMA_RCVBUF, self->optval, + sizeof(struct homa_rcvbuf_args))); + EXPECT_EQ(args.start, (uintptr_t)self->hsk.buffer_pool->region); + EXPECT_EQ(64, self->hsk.buffer_pool->num_bpages); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->so_set_buf_calls); +#endif /* See strip.py */ +} +TEST_F(homa_plumbing, homa_setsockopt__server_bad_optlen) +{ + EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, self->optval, sizeof(int) - 1)); + EXPECT_STREQ("invalid optlen argument: must be sizeof(int)", + self->hsk.error_msg); } +TEST_F(homa_plumbing, homa_setsockopt__server_copy_from_sockptr_fails) +{ + mock_copy_data_errors = 1; + EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, self->optval, sizeof(int))); + EXPECT_STREQ("invalid address for SO_HOMA_SERVER value", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_setsockopt__server_success) +{ + int arg = 7; + + self->optval.user = &arg; + EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, self->optval, sizeof(int))); + EXPECT_EQ(1, self->hsk.is_server); + + arg = 0; + EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, self->optval, sizeof(int))); + EXPECT_EQ(0, self->hsk.is_server); +} +TEST_F(homa_plumbing, homa_setsockopt__bad_optname) +{ + EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, 0, + self->optval, sizeof(struct homa_rcvbuf_args))); + EXPECT_STREQ("setsockopt option not supported by Homa", + self->hsk.error_msg); +} + +TEST_F(homa_plumbing, homa_getsockopt__recvbuf_success) +{ + struct homa_rcvbuf_args val; + int size = sizeof(val) + 10; -TEST_F(homa_plumbing, homa_sendmsg__args_not_in_user_space) + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + EXPECT_EQ(0, -homa_pool_set_region(&self->hsk, (void *)0x40000, + 10*HOMA_BPAGE_SIZE + 1000)); + EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_EQ(0x40000, val.start); + EXPECT_EQ(10*HOMA_BPAGE_SIZE, val.length); + EXPECT_EQ(sizeof(val), size); +} +TEST_F(homa_plumbing, homa_getsockopt__cant_read_size) +{ + struct homa_rcvbuf_args val; + int size = sizeof(val); + + mock_copy_data_errors = 1; + EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, 0, SO_HOMA_RCVBUF, + (char *)&val, &size)); + EXPECT_STREQ("invalid address for optlen argument to getsockopt", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_getsockopt__bad_level) +{ + struct homa_rcvbuf_args val; + int size = sizeof(val); + + EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, 0, SO_HOMA_RCVBUF, + (char *)&val, &size)); + EXPECT_STREQ("homa_setsockopt invoked with level not 
IPPROTO_HOMA", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_getsockopt__recvbuf_bad_length) +{ + struct homa_rcvbuf_args val; + int size = sizeof(val) - 1; + + EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_STREQ("invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_getsockopt__server_bad_length) +{ + int is_server; + int size = sizeof(is_server) - 1; + + EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, (char *)&is_server, &size)); + EXPECT_STREQ("invalid optlen argument: must be sizeof(int)", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_getsockopt__server_success) +{ + int is_server; + int size = sizeof(is_server); + + self->hsk.is_server = 1; + EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, (char *)&is_server, &size)); + EXPECT_EQ(1, is_server); + EXPECT_EQ(sizeof(int), size); + + self->hsk.is_server = 0; + size = 20; + EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, (char *)&is_server, &size)); + EXPECT_EQ(0, is_server); + EXPECT_EQ(sizeof(int), size); +} +TEST_F(homa_plumbing, homa_getsockopt__bad_optname) +{ + struct homa_rcvbuf_args val; + int size = sizeof(val); + + EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF-1, (char *)&val, &size)); + EXPECT_STREQ("getsockopt option not supported by Homa", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) +{ + struct homa_rcvbuf_args val = {.start = 0, .length = 0}; + int size = sizeof(val) + 10; + + mock_copy_to_user_errors = 1; + + EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_STREQ("couldn't update optlen argument to getsockopt: read-only?", + self->hsk.error_msg); + EXPECT_EQ(0, val.start); + EXPECT_EQ(sizeof(val) + 10, size); +} +TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_value) +{ + struct homa_rcvbuf_args val = {.start = 0, .length = 0}; + int size = sizeof(val) + 10; + + mock_copy_to_user_errors = 2; + + EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_STREQ("couldn't update optval argument to getsockopt: read-only?", + self->hsk.error_msg); + EXPECT_EQ(0, val.start); + EXPECT_EQ(sizeof(val), size); +} + +TEST_F(homa_plumbing, homa_sendmsg__msg_name_null) +{ + self->sendmsg_hdr.msg_name = NULL; + EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("no msg_name passed to sendmsg", + self->hsk.error_msg); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +} +TEST_F(homa_plumbing, homa_sendmsg__msg_control_not_in_user_space) { self->sendmsg_hdr.msg_control_is_user = 0; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("msg_control argument for sendmsg isn't in user space", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_sendmsg__cant_read_args) +TEST_F(homa_plumbing, homa_sendmsg__cant_read_msg_control) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("invalid address for msg_control argument to sendmsg", + self->hsk.error_msg); + EXPECT_EQ(0, 
unit_list_length(&self->hsk.active_rpcs)); +} +TEST_F(homa_plumbing, homa_sendmsg__illegal_flag) +{ + self->sendmsg_args.flags = 4; + EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("reserved fields in homa_sendmsg_args must be zero", + self->hsk.error_msg); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +} +TEST_F(homa_plumbing, homa_sendmsg__nonzero_reserved_field) +{ + self->sendmsg_args.reserved = 0x1000; + EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("reserved fields in homa_sendmsg_args must be zero", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__bad_address_family) @@ -293,6 +763,8 @@ TEST_F(homa_plumbing, homa_sendmsg__bad_address_family) self->client_addr.in4.sin_family = 1; EXPECT_EQ(EAFNOSUPPORT, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("address family in sendmsg address must match the socket", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__address_too_short) @@ -302,6 +774,7 @@ TEST_F(homa_plumbing, homa_sendmsg__address_too_short) self->sendmsg_hdr.msg_namelen = sizeof(struct sockaddr_in) - 1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("msg_namelen too short", self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); self->client_addr.in4.sin_family = AF_INET6; @@ -309,20 +782,25 @@ TEST_F(homa_plumbing, homa_sendmsg__address_too_short) self->sendmsg_hdr.msg_namelen = sizeof(struct sockaddr_in6) - 1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("msg_namelen too short", self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_sendmsg__error_in_homa_rpc_new_client) +TEST_F(homa_plumbing, homa_sendmsg__error_in_homa_rpc_alloc_client) { mock_kmalloc_errors = 2; EXPECT_EQ(ENOMEM, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("couldn't allocate memory for homa_peer", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_sendmsg__error_in_homa_message_out_init) +TEST_F(homa_plumbing, homa_sendmsg__error_in_homa_message_out_fill) { self->sendmsg_hdr.msg_iter.count = HOMA_MAX_MESSAGE_LENGTH+1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("message length exceeded HOMA_MAX_MESSAGE_LENGTH", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__cant_update_user_arguments) @@ -331,12 +809,15 @@ TEST_F(homa_plumbing, homa_sendmsg__cant_update_user_arguments) atomic64_set(&self->homa.next_outgoing_id, 1234); EXPECT_EQ(EFAULT, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("couldn't update homa_sendmsg_args argument to sendmsg: read-only?", + self->hsk.error_msg); EXPECT_SUBSTR("xmit DATA 200@0", unit_log_get()); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__request_sent_successfully) { struct homa_rpc *crpc; + 
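+	/* next_outgoing_id is preset so the test can verify the id that
+	 * homa_sendmsg reports back through homa_sendmsg_args.
+	 */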
atomic64_set(&self->homa.next_outgoing_id, 1234); self->sendmsg_args.completion_cookie = 88888; EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, @@ -344,20 +825,32 @@ TEST_F(homa_plumbing, homa_sendmsg__request_sent_successfully) EXPECT_SUBSTR("xmit DATA 200@0", unit_log_get()); EXPECT_EQ(1234L, self->sendmsg_args.id); ASSERT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - crpc = homa_find_client_rpc(&self->hsk, self->sendmsg_args.id); + crpc = homa_rpc_find_client(&self->hsk, self->sendmsg_args.id); ASSERT_NE(NULL, crpc); EXPECT_EQ(88888, crpc->completion_cookie); homa_rpc_unlock(crpc); } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_plumbing, homa_sendmsg__request_metrics) +{ + EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_requests_started); + EXPECT_EQ(200, homa_metrics_per_cpu()->client_request_bytes_started); +} +#endif /* See strip.py */ TEST_F(homa_plumbing, homa_sendmsg__response_nonzero_completion_cookie) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; self->sendmsg_args.completion_cookie = 12345; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("completion_cookie must be zero when sending responses", + self->hsk.error_msg); EXPECT_EQ(RPC_IN_SERVICE, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } @@ -365,9 +858,10 @@ TEST_F(homa_plumbing, homa_sendmsg__response_cant_find_rpc) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id + 1; - EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, + EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); EXPECT_EQ(RPC_IN_SERVICE, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); @@ -376,11 +870,14 @@ TEST_F(homa_plumbing, homa_sendmsg__response_error_in_rpc) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); + self->sendmsg_args.id = srpc->id; srpc->error = -ENOMEM; EXPECT_EQ(ENOMEM, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("RPC has failed, so can't send response", + self->hsk.error_msg); EXPECT_EQ(RPC_DEAD, srpc->state); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } @@ -388,216 +885,325 @@ TEST_F(homa_plumbing, homa_sendmsg__response_wrong_state) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("RPC is not in a state where a response can be sent", + self->hsk.error_msg); EXPECT_EQ(RPC_INCOMING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_sendmsg__homa_message_out_init_returns_error) +TEST_F(homa_plumbing, homa_sendmsg__homa_message_out_fill_returns_error) { 
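+	/* A size greater than HOMA_MAX_MESSAGE_LENGTH causes
+	 * homa_message_out_fill to fail; the server RPC should then
+	 * end up RPC_DEAD rather than remaining on active_rpcs.
+	 */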
struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; self->sendmsg_hdr.msg_iter.count = HOMA_MAX_MESSAGE_LENGTH + 1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("message length exceeded HOMA_MAX_MESSAGE_LENGTH", + self->hsk.error_msg); + EXPECT_EQ(RPC_DEAD, srpc->state); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +} +TEST_F(homa_plumbing, homa_sendmsg__rpc_freed_during_homa_message_out_fill) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 2000, 100); + + unit_hook_register(unlock_hook); + hook_rpc = srpc; + self->sendmsg_args.id = self->server_id; + EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); EXPECT_EQ(RPC_DEAD, srpc->state); + EXPECT_EQ(0, srpc->msgout.num_skbs); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__response_succeeds) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); EXPECT_EQ(RPC_OUTGOING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_plumbing, homa_sendmsg__response_metrics) +{ + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; + EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_responses_started); + EXPECT_EQ(200, homa_metrics_per_cpu()->server_response_bytes_started); +} +#endif /* See strip.py */ TEST_F(homa_plumbing, homa_recvmsg__wrong_args_length) { self->recvmsg_hdr.msg_controllen -= 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("invalid msg_controllen in recvmsg", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__cant_read_args) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("invalid address for msg_control argument to recvmsg", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__clear_cookie) { // Make sure that the completion_cookie will be zero if anything // goes wrong with the receive. 
- self->recvmsg_args._pad[0] = 1; self->recvmsg_args.completion_cookie = 12345; + self->recvmsg_args.num_bpages = 1000000; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, self->recvmsg_args.completion_cookie); } -TEST_F(homa_plumbing, homa_recvmsg__pad_not_zero) +TEST_F(homa_plumbing, homa_recvmsg__num_bpages_too_large) { - self->recvmsg_args._pad[0] = 1; + self->recvmsg_args.num_bpages = HOMA_MAX_BPAGES + 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("num_pages exceeds HOMA_MAX_BPAGES", self->hsk.error_msg); } -TEST_F(homa_plumbing, homa_recvmsg__num_bpages_too_large) +TEST_F(homa_plumbing, homa_recvmsg__reserved_not_zero) { - self->recvmsg_args.num_bpages = HOMA_MAX_BPAGES + 1; + self->recvmsg_args.reserved = 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("reserved fields in homa_recvmsg_args must be zero", + self->hsk.error_msg); } -TEST_F(homa_plumbing, homa_recvmsg__bogus_flags) +TEST_F(homa_plumbing, homa_recvmsg__no_buffer_pool) { - self->recvmsg_args.flags = 1 << 10; + struct homa_pool *saved_pool = self->hsk.buffer_pool; + + self->hsk.buffer_pool = NULL; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("SO_HOMA_RECVBUF socket option has not been set", + self->hsk.error_msg); + self->hsk.buffer_pool = saved_pool; } TEST_F(homa_plumbing, homa_recvmsg__release_buffers) { - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - EXPECT_EQ(0, -homa_pool_get_pages(&self->hsk.buffer_pool, 2, + EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, self->recvmsg_args.bpage_offsets, 0)); - EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool.descriptors[0].refs)); - EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool.descriptors[1].refs)); + EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); + EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); self->recvmsg_args.num_bpages = 2; self->recvmsg_args.bpage_offsets[0] = 0; self->recvmsg_args.bpage_offsets[1] = HOMA_BPAGE_SIZE; EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); - EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool.descriptors[0].refs)); - EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool.descriptors[1].refs)); + 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); + EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); +} +TEST_F(homa_plumbing, homa_recvmsg__error_in_release_buffers) +{ + self->recvmsg_args.num_bpages = 1; + self->recvmsg_args.bpage_offsets[0] = + self->hsk.buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; + + EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("error while releasing buffer pages", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_recvmsg__private_rpc_doesnt_exist) +{ + self->recvmsg_args.id = 99; + + EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, 
&self->recvmsg_hdr,
+			0, 0, &self->recvmsg_hdr.msg_namelen));
+	EXPECT_STREQ("invalid RPC id passed to recvmsg",
+		     self->hsk.error_msg);
+}
+TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_private)
+{
+	struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			self->client_id, 100, 2000);
+
+	EXPECT_NE(NULL, crpc);
+	set_bit(RPC_PRIVATE, &crpc->flags);
+
+	self->recvmsg_args.id = crpc->id;
+
+	EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr,
+			0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen));
+	EXPECT_STREQ("error while waiting for private RPC to complete",
+		     self->hsk.error_msg);
+	EXPECT_EQ(0, self->recvmsg_args.id);
+	EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs));
+}
+TEST_F(homa_plumbing, homa_recvmsg__private_rpc_has_error)
+{
+	struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			self->client_id, 100, 2000);
+
+	EXPECT_NE(NULL, crpc);
+	set_bit(RPC_PRIVATE, &crpc->flags);
+	crpc->error = -ETIMEDOUT;
+
+	self->recvmsg_args.id = crpc->id;
+
+	EXPECT_EQ(ETIMEDOUT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr,
+			0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen));
+	EXPECT_STREQ("RPC failed", self->hsk.error_msg);
+	EXPECT_EQ(self->client_id, self->recvmsg_args.id);
+	EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs));
 }
-TEST_F(homa_plumbing, homa_recvmsg__nonblocking_argument)
+TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_shared)
 {
-	self->recvmsg_args.flags = HOMA_RECVMSG_REQUEST;
 	EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr,
-			0, 1, 0, &self->recvmsg_hdr.msg_namelen));
+			0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen));
+	EXPECT_STREQ("error while waiting for shared RPC to complete",
+		     self->hsk.error_msg);
 }
-TEST_F(homa_plumbing, homa_recvmsg__error_in_homa_wait_for_message)
+TEST_F(homa_plumbing, homa_recvmsg__MSG_DONTWAIT)
 {
-	self->hsk.shutdown = true;
-	EXPECT_EQ(ESHUTDOWN, -homa_recvmsg(&self->hsk.inet.sk,
-			&self->recvmsg_hdr, 0, 0, 0,
+	struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->server_port,
+			self->client_id, 100, 2000);
+
+	EXPECT_NE(NULL, crpc);
+
+	EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk,
+			&self->recvmsg_hdr, 0, MSG_DONTWAIT,
 			&self->recvmsg_hdr.msg_namelen));
-	self->hsk.shutdown = false;
+	EXPECT_STREQ("error while waiting for shared RPC to complete",
+		     self->hsk.error_msg);
 }
 TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4)
 {
+	struct homa_rpc *crpc;
+	u32 pages[2];
+
 	// Make sure the test uses IPv4.
mock_ipv6 = false; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); - __u32 pages[2]; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - EXPECT_EQ(0, -homa_pool_get_pages(&self->hsk.buffer_pool, 2, pages, 0)); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->server_port, - self->client_id, 100, 2000); + EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, pages, 0)); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 100, 2000); EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); crpc->completion_cookie = 44444; EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(44444, self->recvmsg_args.completion_cookie); EXPECT_EQ(AF_INET, self->addr.in4.sin_family); EXPECT_STREQ("1.2.3.4", homa_print_ipv4_addr( self->addr.in4.sin_addr.s_addr)); - EXPECT_EQ(sizeof32(struct sockaddr_in), + EXPECT_EQ(sizeof(struct sockaddr_in), self->recvmsg_hdr.msg_namelen); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, self->recvmsg_args.num_bpages); EXPECT_EQ(2*HOMA_BPAGE_SIZE, self->recvmsg_args.bpage_offsets[0]); - EXPECT_EQ(sizeof(struct homa_recvmsg_args), - (char *) self->recvmsg_hdr.msg_control - - (char *) &self->recvmsg_args); } TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv6) { + struct in6_addr server_ip6; + struct homa_rpc *crpc; + // Make sure the test uses IPv6. 
mock_ipv6 = true; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); - struct in6_addr server_ip6 = unit_get_in_addr("1::3:5:7"); + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + server_ip6 = unit_get_in_addr("1::3:5:7"); - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, &server_ip6, self->server_port, - self->client_id, 100, 2000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, + &server_ip6, self->server_port, self->client_id, + 100, 2000); EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); crpc->completion_cookie = 44444; EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(44444, self->recvmsg_args.completion_cookie); EXPECT_EQ(AF_INET6, self->addr.in6.sin6_family); EXPECT_STREQ("[1::3:5:7]", homa_print_ipv6_addr( &self->addr.in6.sin6_addr)); - EXPECT_EQ(sizeof32(struct sockaddr_in6), + EXPECT_EQ(sizeof(struct sockaddr_in6), self->recvmsg_hdr.msg_namelen); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, crpc->msgin.num_bpages); } TEST_F(homa_plumbing, homa_recvmsg__rpc_has_error) { - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); + + mock_set_ipv6(&self->hsk); EXPECT_NE(NULL, crpc); crpc->completion_cookie = 44444; homa_rpc_abort(crpc, -ETIMEDOUT); EXPECT_EQ(ETIMEDOUT, -homa_recvmsg(&self->hsk.inet.sk, - &self->recvmsg_hdr, 0, 0, 0, + &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("RPC failed", self->hsk.error_msg); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(44444, self->recvmsg_args.completion_cookie); + EXPECT_EQ(AF_INET6, self->addr.in6.sin6_family); + EXPECT_STREQ("1.2.3.4", homa_print_ipv6_addr( + &self->addr.in6.sin6_addr)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, self->recvmsg_args.num_bpages); } TEST_F(homa_plumbing, homa_recvmsg__add_ack) { - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); + struct homa_peer *peer; + EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); crpc->completion_cookie = 44444; + peer = crpc->peer; EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); - EXPECT_EQ(1, crpc->peer->num_acks); + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_EQ(1, peer->num_acks); } TEST_F(homa_plumbing, homa_recvmsg__server_normal_completion) { - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 200); - EXPECT_NE(NULL, srpc); + self->server_id, 100, 200); + EXPECT_NE(NULL, srpc); EXPECT_EQ(100, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, 
&self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->server_id, self->recvmsg_args.id); EXPECT_EQ(RPC_IN_SERVICE, srpc->state); EXPECT_EQ(0, srpc->peer->num_acks); @@ -605,76 +1211,141 @@ TEST_F(homa_plumbing, homa_recvmsg__server_normal_completion) } TEST_F(homa_plumbing, homa_recvmsg__delete_server_rpc_after_error) { - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 200); + self->server_id, 100, 200); + EXPECT_NE(NULL, srpc); srpc->error = -ENOMEM; EXPECT_EQ(ENOMEM, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->server_id, self->recvmsg_args.id); EXPECT_EQ(RPC_DEAD, srpc->state); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } +TEST_F(homa_plumbing, homa_recvmsg__reap_because_of_SOCK_NOSPACE) +{ + /* Make the tx message long enough that it takes multiple reap + * passes (to ensure homa_rpc_reap was called with reap_all==true). + */ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, + self->client_ip, self->server_ip, self->server_port, + self->client_id, 20000, 2000); + + EXPECT_NE(NULL, crpc); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_TRUE(refcount_read(&self->hsk.sock.sk_wmem_alloc) > 20000); + + set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); + EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); + EXPECT_EQ(0, self->hsk.dead_skbs); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_calls)); +} TEST_F(homa_plumbing, homa_recvmsg__error_copying_out_args) { - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); + EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("couldn't update homa_recvmsg_args argument to recvmsg: read-only?", + self->hsk.error_msg); EXPECT_EQ(0, self->recvmsg_args.id); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_recvmsg__copy_back_args_even_after_error) { - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - (char *) 0x1000000, 100*HOMA_BPAGE_SIZE)); - EXPECT_EQ(0, -homa_pool_get_pages(&self->hsk.buffer_pool, 2, + EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, self->recvmsg_args.bpage_offsets, 0)); - EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool.descriptors[0].refs)); - EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool.descriptors[1].refs)); + EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); + EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); self->recvmsg_args.num_bpages = 2; self->recvmsg_args.bpage_offsets[0] = 0; self->recvmsg_args.bpage_offsets[1] = HOMA_BPAGE_SIZE; EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); + 
0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, self->recvmsg_args.num_bpages); } TEST_F(homa_plumbing, homa_softirq__basics) { struct sk_buff *skb; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + homa_softirq(skb); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); +} +TEST_F(homa_plumbing, homa_softirq__cant_pull_header) +{ + struct sk_buff *skb; + + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + skb->data_len = skb->len - 20; + homa_softirq(skb); + EXPECT_STREQ("pskb discard", unit_log_get()); +} +TEST_F(homa_plumbing, homa_softirq__remove_extra_headers) +{ + struct sk_buff *skb; + + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + __skb_push(skb, 10); homa_softirq(skb); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_softirq__reorder_incoming_packets) +TEST_F(homa_plumbing, homa_softirq__packet_too_short) +{ + struct sk_buff *skb; + struct homa_ack_hdr h; + + h.common.type = ACK; + skb = mock_skb_alloc(self->client_ip, &h.common, 0, 0); + skb->len -= 1; + homa_softirq(skb); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->short_packets); +#endif /* See strip.py */ +} +TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) +{ + struct sk_buff *skb; + + self->data.common.type = MAX_OP + 1; + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + homa_softirq(skb); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->short_packets); +#endif /* See strip.py */ +} +TEST_F(homa_plumbing, homa_softirq__process_short_messages_first) { struct sk_buff *skb, *skb2, *skb3, *skb4; self->data.common.sender_id = cpu_to_be64(2000); self->data.message_length = htonl(2000); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); - self->data.common.sender_id = cpu_to_be64(200); - self->data.message_length = htonl(200); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 200, 0); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); self->data.common.sender_id = cpu_to_be64(300); self->data.message_length = htonl(300); - skb3 = mock_skb_new(self->client_ip, &self->data.common, 300, 0); + skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 300, 0); + self->data.common.sender_id = cpu_to_be64(200); + self->data.message_length = htonl(1600); + self->data.seg.offset = htonl(1400); + skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 200, 0); self->data.common.sender_id = cpu_to_be64(5000); self->data.message_length = htonl(5000); - skb4 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + self->data.seg.offset = 0; + skb4 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); skb_shinfo(skb)->frag_list = skb2; skb2->next = skb3; skb3->next = skb4; @@ -682,32 +1353,28 @@ TEST_F(homa_plumbing, homa_softirq__reorder_incoming_packets) homa_softirq(skb); unit_log_clear(); unit_log_active_ids(&self->hsk); - EXPECT_STREQ("201 301 2001 5001", unit_log_get()); + EXPECT_STREQ("301 2001 201 5001", unit_log_get()); } -TEST_F(homa_plumbing, homa_softirq__reorder_short_packet_at_front) +TEST_F(homa_plumbing, homa_softirq__process_control_first) { - struct sk_buff *skb, *skb2, *skb3, *skb4; + struct homa_common_hdr unknown = { + .sport = htons(self->client_port), + 
.dport = htons(self->server_port), + .type = RPC_UNKNOWN, + .sender_id = cpu_to_be64(self->client_id) + }; + struct sk_buff *skb, *skb2; + + self->data.common.sender_id = cpu_to_be64(2000); + self->data.message_length = htonl(2000); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + skb2 = mock_skb_alloc(self->client_ip, &unknown, 0, 0); - self->data.common.sender_id = cpu_to_be64(200); - self->data.message_length = htonl(200); - skb = mock_skb_new(self->client_ip, &self->data.common, 200, 0); - self->data.common.sender_id = cpu_to_be64(4000); - self->data.message_length = htonl(4000); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); - self->data.common.sender_id = cpu_to_be64(300); - self->data.message_length = htonl(300); - skb3 = mock_skb_new(self->client_ip, &self->data.common, 300, 0); - self->data.common.sender_id = cpu_to_be64(5000); - self->data.message_length = htonl(5000); - skb4 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); skb_shinfo(skb)->frag_list = skb2; - skb2->next = skb3; - skb3->next = skb4; - skb4->next = NULL; - homa_softirq(skb); + skb2->next = NULL; unit_log_clear(); - unit_log_active_ids(&self->hsk); - EXPECT_STREQ("201 301 4001 5001", unit_log_get()); + homa_softirq(skb); + EXPECT_SUBSTR("homa_softirq shortcut type 0x13", unit_log_get()); } TEST_F(homa_plumbing, homa_softirq__nothing_to_reorder) { @@ -715,13 +1382,13 @@ TEST_F(homa_plumbing, homa_softirq__nothing_to_reorder) self->data.common.sender_id = cpu_to_be64(2000); self->data.message_length = htonl(2000); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); self->data.common.sender_id = cpu_to_be64(3000); self->data.message_length = htonl(3000); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); self->data.common.sender_id = cpu_to_be64(5000); self->data.message_length = htonl(5000); - skb3 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); skb_shinfo(skb)->frag_list = skb2; skb2->next = skb3; skb3->next = NULL; @@ -730,168 +1397,199 @@ TEST_F(homa_plumbing, homa_softirq__nothing_to_reorder) unit_log_active_ids(&self->hsk); EXPECT_STREQ("2001 3001 5001", unit_log_get()); } -TEST_F(homa_plumbing, homa_softirq__cant_pull_header) -{ - struct sk_buff *skb; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - skb->data_len = skb->len - 20; - homa_softirq(skb); - EXPECT_STREQ("pskb discard", unit_log_get()); -} -TEST_F(homa_plumbing, homa_softirq__remove_extra_headers) -{ - struct sk_buff *skb; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - __skb_push(skb, 10); - homa_softirq(skb); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); -} -TEST_F(homa_plumbing, homa_softirq__packet_too_short) +TEST_F(homa_plumbing, homa_softirq__per_rpc_batching) { - struct sk_buff *skb; - struct ack_header h; - h.common.type = ACK; - skb = mock_skb_new(self->client_ip, &h.common, 0, 0); - skb->len -= 1; - homa_softirq(skb); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.short_packets); -} -TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) -{ - struct sk_buff *skb; - self->data.common.type = BOGUS; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + struct sk_buff *skb, *tail; + + 
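/* Interleave DATA packets from three RPCs (sender ids 2000, 2002,
+	 * and 2004) in one frag_list; the log check below verifies that
+	 * homa_softirq processes all of each RPC's packets as a batch
+	 * before moving on to the next RPC.
+	 */
+	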
self->data.common.sender_id = cpu_to_be64(2000); + self->data.message_length = htonl(10000); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = skb; + + self->data.common.sender_id = cpu_to_be64(2002); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = tail->next; + + self->data.common.sender_id = cpu_to_be64(2004); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = tail->next; + + self->data.common.sender_id = cpu_to_be64(2002); + self->data.seg.offset = htonl(1400); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = tail->next; + + self->data.common.sender_id = cpu_to_be64(2004); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = tail->next; + + self->data.common.sender_id = cpu_to_be64(2002); + self->data.seg.offset = htonl(4200); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = tail->next; + + self->data.common.sender_id = cpu_to_be64(2002); + self->data.seg.offset = htonl(2800); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = tail->next; + + self->data.common.sender_id = cpu_to_be64(2004); + self->data.seg.offset = htonl(5600); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = tail->next; + + self->data.common.sender_id = cpu_to_be64(2002); + self->data.seg.offset = htonl(7000); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + tail = tail->next; + + skb_shinfo(skb)->frag_list = skb->next; + skb->next = NULL; + unit_log_clear(); homa_softirq(skb); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.short_packets); + EXPECT_STREQ("id 2001, offsets 0; " + "sk->sk_data_ready invoked; " + "id 2003, offsets 0 1400 4200 2800 7000; " + "sk->sk_data_ready invoked; " + "id 2005, offsets 0 1400 5600; " + "sk->sk_data_ready invoked", + unit_log_get()); } -TEST_F(homa_plumbing, homa_softirq__unknown_socket_ipv4) + +TEST_F(homa_plumbing, homa_err_handler_v4__port_unreachable) { - struct sk_buff *skb; - self->data.common.dport = htons(100); + struct homa_rpc *crpc; + struct icmphdr *icmph; + struct sk_buff *icmp, *failed; - // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + ASSERT_NE(NULL, crpc); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - homa_softirq(skb); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_STREQ("icmp_send type 3, code 3", unit_log_get()); -} -TEST_F(homa_plumbing, homa_softirq__unknown_socket_ipv6) -{ - struct sk_buff *skb; - self->data.common.dport = htons(100); + failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); + ip_hdr(failed)->daddr = ipv6_to_ipv4(self->server_ip[0]); - // Make sure the test uses IPv6. 
- mock_ipv6 = true; - homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + icmp = mock_raw_skb(self->server_ip, IPPROTO_ICMP, 1000); + icmph = skb_put(icmp, sizeof *icmph); + icmph->type = ICMP_DEST_UNREACH; + icmph->code = ICMP_PORT_UNREACH; + icmp->data = skb_tail_pointer(icmp); + memcpy(skb_put(icmp, failed->len), failed->head, failed->len); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - homa_softirq(skb); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_STREQ("icmp6_send type 1, code 4", unit_log_get()); + EXPECT_EQ(0, homa_err_handler_v4(icmp, 111)); + EXPECT_EQ(ENOTCONN, -crpc->error); + + kfree_skb(icmp); + kfree_skb(failed); } -TEST_F(homa_plumbing, homa_softirq__multiple_packets_different_sockets) +TEST_F(homa_plumbing, homa_err_handler_v4__host_unreachable) { - struct sk_buff *skb, *skb2; - struct homa_sock sock2; - mock_sock_init(&sock2, &self->homa, 0); - homa_sock_bind(&self->homa.port_map, &sock2, self->server_port+1); - - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - self->data.common.sender_id += 2; - self->data.common.dport = htons(self->server_port+1); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - skb_shinfo(skb)->frag_list = skb2; - skb2->next = NULL; - homa_softirq(skb); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, unit_list_length(&sock2.active_rpcs)); - homa_sock_destroy(&sock2); + struct homa_rpc *crpc; + struct icmphdr *icmph; + struct sk_buff *icmp, *failed; + + mock_ipv6 = false; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + ASSERT_NE(NULL, crpc); + + failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); + ip_hdr(failed)->daddr = ipv6_to_ipv4(self->server_ip[0]); + + icmp = mock_raw_skb(self->server_ip, IPPROTO_ICMP, 1000); + icmph = skb_put(icmp, sizeof *icmph); + icmph->type = ICMP_DEST_UNREACH; + icmph->code = ICMP_HOST_UNKNOWN; + icmp->data = skb_tail_pointer(icmp); + memcpy(skb_put(icmp, failed->len), failed->head, failed->len); + + EXPECT_EQ(0, homa_err_handler_v4(icmp, 111)); + EXPECT_EQ(EHOSTUNREACH, -crpc->error); + + kfree_skb(icmp); + kfree_skb(failed); } -TEST_F(homa_plumbing, homa_softirq__multiple_packets_same_socket) + +TEST_F(homa_plumbing, homa_err_handler_v6__port_unreachable) { - struct sk_buff *skb, *skb2; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - self->data.common.sender_id += cpu_to_be64(self->client_id + 2); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - skb_shinfo(skb)->frag_list = skb2; - skb2->next = NULL; - homa_softirq(skb); - EXPECT_EQ(2, unit_list_length(&self->hsk.active_rpcs)); + struct homa_rpc *crpc; + struct sk_buff *icmp, *failed; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + ASSERT_NE(NULL, crpc); + + failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); + ipv6_hdr(failed)->daddr = self->server_ip[0]; + + icmp = mock_raw_skb(self->server_ip, IPPROTO_ICMP, 1000); + memcpy(skb_put(icmp, failed->len), skb_network_header(failed), + failed->len); + + EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, 0, 111)); + EXPECT_EQ(ENOTCONN, -crpc->error); + + kfree_skb(icmp); + kfree_skb(failed); } -TEST_F(homa_plumbing, homa_softirq__update_total_incoming) +TEST_F(homa_plumbing, 
homa_err_handler_v6__protocol_not_supported) { - struct sk_buff *skb, *skb2; + struct homa_rpc *crpc; + struct sk_buff *icmp, *failed; - self->data.seg.segment_length = htonl(1400); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); - self->data.seg.offset = htonl(1400); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - skb_shinfo(skb2)->frag_list = skb; - skb->next = NULL; - homa_softirq(skb2); - unit_log_clear(); - unit_log_active_ids(&self->hsk); - EXPECT_STREQ("1235", unit_log_get()); - EXPECT_EQ(7200, atomic_read(&self->homa.total_incoming)); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + ASSERT_NE(NULL, crpc); + + failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); + ipv6_hdr(failed)->daddr = self->server_ip[0]; + + icmp = mock_raw_skb(self->server_ip, IPPROTO_ICMP, 1000); + memcpy(skb_put(icmp, failed->len), skb_network_header(failed), + failed->len); + + EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_PARAMPROB, + ICMPV6_UNK_NEXTHDR, 0, 111)); + EXPECT_EQ(EPROTONOSUPPORT, -crpc->error); + + kfree_skb(icmp); + kfree_skb(failed); } -TEST_F(homa_plumbing, homa_metrics_open) +TEST_F(homa_plumbing, homa_poll__no_tx_buffer_space) { - EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); - EXPECT_NE(NULL, self->homa.metrics); + struct socket sock = {.sk = &self->hsk.sock}; - strcpy(self->homa.metrics, "12345"); - EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); - EXPECT_EQ(5, strlen(self->homa.metrics)); - EXPECT_EQ(2, self->homa.metrics_active_opens); + self->hsk.sock.sk_sndbuf = 0; + EXPECT_EQ(0, homa_poll(NULL, &sock, NULL)); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); } -TEST_F(homa_plumbing, homa_metrics_read__basics) +TEST_F(homa_plumbing, homa_poll__not_readable) { - char buffer[1000]; - loff_t offset = 10; - self->homa.metrics = kmalloc(100, GFP_KERNEL); - self->homa.metrics_capacity = 100; - strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); - self->homa.metrics_length = 26; - EXPECT_EQ(5, homa_metrics_read(NULL, buffer, 5, &offset)); - EXPECT_SUBSTR("_copy_to_user copied 5 bytes", unit_log_get()); - EXPECT_EQ(15, offset); + struct socket sock = {.sk = &self->hsk.sock}; - unit_log_clear(); - EXPECT_EQ(11, homa_metrics_read(NULL, buffer, 1000, &offset)); - EXPECT_SUBSTR("_copy_to_user copied 11 bytes", unit_log_get()); - EXPECT_EQ(26, offset); - - unit_log_clear(); - EXPECT_EQ(0, homa_metrics_read(NULL, buffer, 1000, &offset)); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(26, offset); -} -TEST_F(homa_plumbing, homa_metrics_read__error_copying_to_user) -{ - char buffer[1000]; - loff_t offset = 10; - self->homa.metrics = kmalloc(100, GFP_KERNEL); - self->homa.metrics_capacity = 100; - strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); - self->homa.metrics_length = 26; - mock_copy_to_user_errors = 1; - EXPECT_EQ(EFAULT, -homa_metrics_read(NULL, buffer, 5, &offset)); + EXPECT_EQ(POLLOUT | POLLWRNORM, homa_poll(NULL, &sock, NULL)); } +TEST_F(homa_plumbing, homa_poll__socket_shutdown) +{ + struct socket sock = {.sk = &self->hsk.sock}; -TEST_F(homa_plumbing, homa_metrics_release) + unit_sock_destroy(&self->hsk); + EXPECT_EQ(POLLIN | POLLOUT | POLLWRNORM, homa_poll(NULL, &sock, NULL)); +} +TEST_F(homa_plumbing, homa_poll__socket_readable) { - self->homa.metrics_active_opens = 2; - EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); - EXPECT_EQ(1, self->homa.metrics_active_opens); + struct socket sock = {.sk 
= &self->hsk.sock}; - EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); - EXPECT_EQ(0, self->homa.metrics_active_opens); + unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + EXPECT_EQ(POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM, + homa_poll(NULL, &sock, NULL)); } diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index af9b461d..4bcfb249 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -1,46 +1,35 @@ -/* Copyright (c) 2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" -#include "homa_impl.h" +#include "homa_grant.h" +#include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" #include "mock.h" #include "utils.h" -#define REGION_SIZE (1024*HOMA_BPAGE_SIZE) - static struct homa_pool *cur_pool; FIXTURE(homa_pool) { struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; - void *buffer_region; struct in6_addr client_ip; struct in6_addr server_ip; }; FIXTURE_SETUP(homa_pool) { homa_init(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); - self->buffer_region = (void *) 0x1000000; + self->hnet = mock_hnet(0, &self->homa); +#ifndef __STRIP__ /* See strip.py */ + self->homa.unsched_bytes = 10000; + self->homa.grant->window = 10000; +#endif /* See strip.py */ + mock_sock_init(&self->hsk, self->hnet, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); self->server_ip = unit_get_in_addr("1.2.3.4"); - cur_pool = &self->hsk.buffer_pool; - ASSERT_NE(NULL, self->buffer_region); + cur_pool = self->hsk.buffer_pool; } FIXTURE_TEARDOWN(homa_pool) { @@ -55,386 +44,458 @@ static void steal_bpages_hook(char *id) return; if (!cur_pool) return; - switch (atomic_read(&cur_pool->next_scan)) { + switch (cur_pool->cores[1].next_candidate) { case 1: - atomic_set(&cur_pool->descriptors[0].refs, 1); + atomic_set(&cur_pool->descriptors[0].refs, 2); break; + case 2: + atomic_set(&cur_pool->descriptors[1].refs, 1); + cur_pool->descriptors[1].owner = 3; + cur_pool->descriptors[1].expiration = mock_clock + 1; case 3: - cur_pool->descriptors[2].owner = 0; - cur_pool->descriptors[2].expiration = mock_cycles + 1; + atomic_set(&cur_pool->descriptors[2].refs, 1); + cur_pool->descriptors[2].owner = 3; + cur_pool->descriptors[2].expiration = mock_clock - 1; + case 4: + atomic_set(&cur_pool->descriptors[3].refs, 1); } } +#ifndef __STRIP__ /* See strip.py */ +static void change_owner_hook(char *id) +{ + if (strcmp(id, "spin_lock") != 0) + return; + if (!cur_pool) + return; + cur_pool->descriptors[cur_pool->cores[smp_processor_id()] + .page_hint].owner = -1; +} +#endif /* See strip.py */ -TEST_F(homa_pool, homa_pool_init__basics) +TEST_F(homa_pool, set_bpages_needed) { - struct homa_pool 
*pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - EXPECT_EQ(100, pool->num_bpages); - EXPECT_EQ(4, atomic_read(&pool->active_pages)); - EXPECT_EQ(0, atomic_read(&pool->next_scan)); - EXPECT_EQ(-1, pool->descriptors[98].owner); + struct homa_pool *pool = self->hsk.buffer_pool; + + atomic_set(&pool->free_bpages, 0); + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE+1); + ASSERT_FALSE(list_empty(&self->hsk.waiting_for_bufs)); + EXPECT_EQ(3, pool->bpages_needed); + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); + EXPECT_EQ(2, pool->bpages_needed); } -TEST_F(homa_pool, homa_pool_init__region_not_page_aligned) + +TEST_F(homa_pool, homa_pool_alloc) { - EXPECT_EQ(EINVAL, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - ((char *) self->buffer_region) + 10, + struct homa_pool *pool; + + /* Success */ + pool = homa_pool_alloc(&self->hsk); + EXPECT_FALSE(IS_ERR(pool)); + EXPECT_EQ(pool->hsk, &self->hsk); + homa_pool_free(pool); + + /* Can't allocate memory. */ + mock_kmalloc_errors = 1; + pool = homa_pool_alloc(&self->hsk); + EXPECT_TRUE(IS_ERR(pool)); + EXPECT_EQ(ENOMEM, -PTR_ERR(pool)); +} + +TEST_F(homa_pool, homa_pool_set_region__region_not_page_aligned) +{ + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + + EXPECT_EQ(EINVAL, -homa_pool_set_region(&self->hsk, + ((char *) 0x1000000) + 10, 100*HOMA_BPAGE_SIZE)); } -TEST_F(homa_pool, homa_pool_init__region_too_small) +TEST_F(homa_pool, homa_pool_set_region__region_too_small) { - EXPECT_EQ(EINVAL, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - self->buffer_region, 3*HOMA_BPAGE_SIZE)); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + + EXPECT_EQ(EINVAL, -homa_pool_set_region(&self->hsk, (void *) 0x1000000, + HOMA_BPAGE_SIZE)); } -TEST_F(homa_pool, homa_pool_init__cant_allocate_descriptors) +TEST_F(homa_pool, homa_pool_set_region__cant_allocate_descriptors) { + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + mock_kmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); + EXPECT_EQ(ENOMEM, -homa_pool_set_region(&self->hsk, (void *) 0x100000, + 100*HOMA_BPAGE_SIZE)); } -TEST_F(homa_pool, homa_pool_init__cant_allocate_core_info) +TEST_F(homa_pool, homa_pool_set_region__cant_allocate_core_info) { + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + mock_kmalloc_errors = 2; - EXPECT_EQ(ENOMEM, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); + EXPECT_EQ(ENOMEM, -homa_pool_set_region(&self->hsk, (void *) 0x100000, + 100*HOMA_BPAGE_SIZE)); +} +TEST_F(homa_pool, homa_pool_set_region__pool_already_has_region) +{ + EXPECT_EQ(EINVAL, -homa_pool_set_region(&self->hsk, (void *) 0x100000, + 100*HOMA_BPAGE_SIZE)); +} +TEST_F(homa_pool, homa_pool_set_region__success) +{ + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + + EXPECT_EQ(0, -homa_pool_set_region(&self->hsk, (void *) 0x100000, + 78*HOMA_BPAGE_SIZE)); + EXPECT_EQ(78, self->hsk.buffer_pool->num_bpages); + EXPECT_EQ(-1, self->hsk.buffer_pool->descriptors[69].owner); } -TEST_F(homa_pool, 
homa_pool_destroy__idempotent) +TEST_F(homa_pool, homa_pool_get_rcvbuf) { - EXPECT_EQ(0, -homa_pool_init(&self->hsk.buffer_pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - homa_pool_destroy(&self->hsk.buffer_pool); - homa_pool_destroy(&self->hsk.buffer_pool); + struct homa_rcvbuf_args args; + + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + + EXPECT_EQ(0, -homa_pool_set_region(&self->hsk, (void *)0x40000, + 10*HOMA_BPAGE_SIZE + 1000)); + homa_pool_get_rcvbuf(self->hsk.buffer_pool, &args); + EXPECT_EQ(0x40000, args.start); + EXPECT_EQ(10*HOMA_BPAGE_SIZE, args.length); } TEST_F(homa_pool, homa_pool_get_pages__basics) { - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(0, pages[0]); EXPECT_EQ(1, pages[1]); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); EXPECT_EQ(-1, pool->descriptors[1].owner); - EXPECT_EQ(2, atomic_read(&pool->next_scan)); - EXPECT_EQ(2, atomic_read(&pool->free_bpages_found)); -} -TEST_F(homa_pool, homa_pool_get_pages__no_buffer_space) -{ - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->active_pages, pool->num_bpages); - atomic_set(&pool->next_scan, pool->num_bpages); - atomic_set(&pool->free_bpages_found, 0); - EXPECT_EQ(-1, homa_pool_get_pages(pool, 2, pages, 0)); + EXPECT_EQ(2, pool->cores[smp_processor_id()].next_candidate); + EXPECT_EQ(98, atomic_read(&pool->free_bpages)); } -TEST_F(homa_pool, homa_pool_get_pages__grow_active_pool) +TEST_F(homa_pool, homa_pool_get_pages__not_enough_space) { - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->active_pages, 5); - atomic_set(&pool->next_scan, 5); - atomic_set(&pool->free_bpages_found, 1); + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + + atomic_set(&pool->free_bpages, 1); + EXPECT_EQ(-1, homa_pool_get_pages(pool, 2, pages, 0)); + atomic_set(&pool->free_bpages, 2); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); - EXPECT_EQ(5, pages[0]); - EXPECT_EQ(6, pages[1]); - EXPECT_EQ(7, atomic_read(&pool->active_pages)); -} -TEST_F(homa_pool, homa_pool_get_pages__grow_fails_pool_max_size) -{ - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - pool->num_bpages = 5; - atomic_set(&pool->active_pages, 5); - atomic_set(&pool->next_scan, 5); - atomic_set(&pool->free_bpages_found, 1); - EXPECT_EQ(0, -homa_pool_get_pages(pool, 1, pages, 0)); - EXPECT_EQ(0, pages[0]); - EXPECT_EQ(5, atomic_read(&pool->active_pages)); } -TEST_F(homa_pool, homa_pool_get_pages__shrink_active_pool) +TEST_F(homa_pool, homa_pool_get_pages__set_limit) { - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->active_pages, 20); - atomic_set(&pool->next_scan, 22); - atomic_set(&pool->free_bpages_found, 11); + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + + atomic_set(&pool->free_bpages, 62); + 
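+	/* Only part of the pool is scanned when most bpages are free,
+	 * so the allocation below should take bpage 49 and then wrap
+	 * around to bpage 0 rather than continuing through the tail
+	 * of the pool.
+	 */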
pool->cores[smp_processor_id()].next_candidate = 49; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); - EXPECT_EQ(0, pages[0]); - EXPECT_EQ(1, pages[1]); - EXPECT_EQ(18, atomic_read(&pool->active_pages)); + EXPECT_EQ(49, pages[0]); + EXPECT_EQ(0, pages[1]); } -TEST_F(homa_pool, homa_pool_get_pages__dont_shrink_below_MIN_ACTIVE) +TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA) { - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->active_pages, 4); - atomic_set(&pool->next_scan, 4); - atomic_set(&pool->free_bpages_found, 3); - EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); - EXPECT_EQ(0, pages[0]); - EXPECT_EQ(1, pages[1]); - EXPECT_EQ(4, atomic_read(&pool->active_pages)); -} -TEST_F(homa_pool, homa_pool_get_pages__basic_wraparound) -{ - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->active_pages, 10); - atomic_set(&pool->next_scan, 10); - atomic_set(&pool->free_bpages_found, 3); + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + + atomic_set(&pool->free_bpages, 92); + pool->cores[smp_processor_id()].next_candidate = 13; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); - EXPECT_EQ(0, pages[0]); - EXPECT_EQ(1, pages[1]); - EXPECT_EQ(10, atomic_read(&pool->active_pages)); - EXPECT_EQ(2, atomic_read(&pool->free_bpages_found)); + EXPECT_EQ(13, pages[0]); + EXPECT_EQ(0, pages[1]); } TEST_F(homa_pool, homa_pool_get_pages__skip_unusable_bpages) { - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->active_pages, 10); - atomic_set(&pool->descriptors[0].refs, 1); + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + + mock_clock = 1000; + atomic_set(&pool->descriptors[0].refs, 2); + atomic_set(&pool->descriptors[1].refs, 1); + pool->descriptors[1].owner = 3; + pool->descriptors[1].expiration = mock_clock + 1; + atomic_set(&pool->descriptors[2].refs, 1); pool->descriptors[2].owner = 3; - pool->descriptors[2].expiration = mock_cycles + 1; - mock_trylock_errors = 2; + pool->descriptors[2].expiration = mock_clock - 1; + atomic_set(&pool->descriptors[3].refs, 1); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); - EXPECT_EQ(1, pages[0]); + EXPECT_EQ(2, pages[0]); EXPECT_EQ(4, pages[1]); } +TEST_F(homa_pool, homa_pool_get_pages__cant_lock_pages) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + + mock_clock = 1000; + mock_trylock_errors = 3; + EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); + EXPECT_EQ(2, pages[0]); + EXPECT_EQ(3, pages[1]); +} TEST_F(homa_pool, homa_pool_get_pages__state_changes_while_locking) { - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + + mock_clock = 1000; unit_hook_register(steal_bpages_hook); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); - EXPECT_EQ(1, pages[0]); - EXPECT_EQ(3, pages[1]); + EXPECT_EQ(2, pages[0]); + EXPECT_EQ(4, pages[1]); } TEST_F(homa_pool, homa_pool_get_pages__steal_expired_page) { - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, 
-homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + pool->descriptors[0].owner = 5; - mock_cycles = 5000; - pool->descriptors[0].expiration = mock_cycles - 1; + mock_clock = 5000; + pool->descriptors[0].expiration = mock_clock - 1; + atomic_set(&pool->free_bpages, 20); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(0, pages[0]); EXPECT_EQ(1, pages[1]); EXPECT_EQ(-1, pool->descriptors[0].owner); + EXPECT_EQ(19, atomic_read(&pool->free_bpages)); } TEST_F(homa_pool, homa_pool_get_pages__set_owner) { - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10]; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + self->homa.bpage_lease_cycles = 1000; - mock_cycles = 5000; + mock_clock = 5000; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 1)); EXPECT_EQ(1, pool->descriptors[pages[0]].owner); - EXPECT_EQ(mock_cycles + 1000, + EXPECT_EQ(mock_clock + 1000, pool->descriptors[pages[1]].expiration); -} -TEST_F(homa_pool, homa_pool_get_pages__storage_exhausted_after_bpages_allocated) -{ - struct homa_pool *pool = &self->hsk.buffer_pool; - __u32 pages[10], i; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - pool->num_bpages = 5; - atomic_set(&pool->active_pages, pool->num_bpages); - for (i = 0; i < pool->num_bpages; i++) { - if ((i == 2) || (i == 3)) - continue; - atomic_inc(&pool->descriptors[i].refs); - } - EXPECT_EQ(-1, homa_pool_get_pages(pool, 3, pages, 1)); - EXPECT_EQ(1, atomic_read(&pool->descriptors[0].refs)); - EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); - EXPECT_EQ(0, atomic_read(&pool->descriptors[2].refs)); - EXPECT_EQ(0, atomic_read(&pool->descriptors[3].refs)); - EXPECT_EQ(1, atomic_read(&pool->descriptors[4].refs)); - EXPECT_EQ(-1, pool->descriptors[2].owner); + EXPECT_EQ(2, atomic_read(&pool->descriptors[1].refs)); } -TEST_F(homa_pool, homa_pool_allocate__basics) +TEST_F(homa_pool, homa_pool_alloc_msg__basics) { - struct homa_pool *pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 150000); - ASSERT_NE(NULL, crpc); + struct homa_pool *pool = self->hsk.buffer_pool; - EXPECT_EQ(0, homa_pool_allocate(crpc)); + ASSERT_NE(NULL, crpc); EXPECT_EQ(3, crpc->msgin.num_bpages); EXPECT_EQ(0, crpc->msgin.bpage_offsets[0]); EXPECT_EQ(-1, pool->descriptors[0].owner); EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[2]); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); EXPECT_EQ(150000 - 2*HOMA_BPAGE_SIZE, - pool->cores[cpu_number].allocated); + pool->cores[smp_processor_id()].allocated); } -TEST_F(homa_pool, homa_pool_allocate__out_of_buffer_space) +TEST_F(homa_pool, homa_pool_alloc_msg__no_buffer_pool) { - struct homa_pool *pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 5*HOMA_BPAGE_SIZE)); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 150000); + + ASSERT_NE(NULL, crpc); + + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + + EXPECT_EQ(ENOMEM, 
-homa_pool_alloc_msg(crpc)); +} +TEST_F(homa_pool, homa_pool_alloc_msg__cant_allocate_full_bpages) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + atomic_set(&pool->free_bpages, 1); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 150000); ASSERT_NE(NULL, crpc); - atomic_set(&pool->descriptors[1].refs, 1); - atomic_set(&pool->descriptors[2].refs, 1); - atomic_set(&pool->descriptors[3].refs, 1); - atomic_set(&pool->descriptors[4].refs, 1); - EXPECT_EQ(1, -homa_pool_allocate(crpc)); EXPECT_EQ(0, crpc->msgin.num_bpages); + EXPECT_FALSE(list_empty(&crpc->buf_links)); + EXPECT_EQ(1, atomic_read(&pool->free_bpages)); } -TEST_F(homa_pool, homa_pool_allocate__owned_page_locked) +TEST_F(homa_pool, homa_pool_alloc_msg__no_partial_page) { - struct homa_pool *pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->next_scan, 2); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + atomic_set(&pool->free_bpages, 2); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, + 2*HOMA_BPAGE_SIZE); ASSERT_NE(NULL, crpc); - EXPECT_EQ(0, homa_pool_allocate(crpc)); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); + EXPECT_EQ(2, crpc->msgin.num_bpages); + EXPECT_EQ(0, crpc->msgin.bpage_offsets[0]); + EXPECT_EQ(HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[1]); + EXPECT_EQ(0, atomic_read(&pool->free_bpages)); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_pool, homa_pool_alloc_msg__owned_page_locked_and_page_stolen) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + pool->cores[smp_processor_id()].next_candidate = 2; + atomic_set(&pool->free_bpages, 40); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); + ASSERT_NE(NULL, crpc); + + // First allocation just sets up a partially-allocated bpage. + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); + + // Try a second allocation; the lock hook steals the partial bpage, + // so a new one has to be allocated. 
crpc->msgin.num_bpages = 0; - mock_trylock_errors = 1; - EXPECT_EQ(0, homa_pool_allocate(crpc)); + mock_trylock_errors = 1; + unit_hook_register(change_owner_hook); + EXPECT_EQ(0, homa_pool_alloc_msg(crpc)); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(3, pool->cores[cpu_number].page_hint); - EXPECT_EQ(2000, pool->cores[cpu_number].allocated); - EXPECT_EQ(1, pool->descriptors[2].owner); + EXPECT_EQ(3, pool->cores[smp_processor_id()].page_hint); + EXPECT_EQ(2000, pool->cores[smp_processor_id()].allocated); + EXPECT_EQ(1, -pool->descriptors[2].owner); EXPECT_EQ(1, pool->descriptors[3].owner); + EXPECT_EQ(38, atomic_read(&pool->free_bpages)); } -TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) +#endif /* See strip.py */ +TEST_F(homa_pool, homa_pool_alloc_msg__page_wrap_around) { - struct homa_pool *pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->next_scan, 2); - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + pool->cores[smp_processor_id()].page_hint = 2; + pool->cores[smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; + atomic_set(&pool->descriptors[2].refs, 1); + pool->descriptors[2].owner = smp_processor_id(); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); + ASSERT_NE(NULL, crpc); + + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); + EXPECT_EQ(1, crpc->msgin.num_bpages); + EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); + EXPECT_EQ(2000, pool->cores[smp_processor_id()].allocated); + EXPECT_EQ(smp_processor_id(), pool->descriptors[2].owner); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->bpage_reuses); +#endif /* See strip.py */ +} +TEST_F(homa_pool, homa_pool_alloc_msg__owned_page_overflow) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + pool->cores[smp_processor_id()].next_candidate = 2; + atomic_set(&pool->free_bpages, 50); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); + ASSERT_NE(NULL, crpc); + + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); + crpc->msgin.num_bpages = 0; + pool->cores[smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; + EXPECT_EQ(0, homa_pool_alloc_msg(crpc)); + EXPECT_EQ(1, crpc->msgin.num_bpages); + EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); + EXPECT_EQ(3, pool->cores[smp_processor_id()].page_hint); + EXPECT_EQ(2000, pool->cores[smp_processor_id()].allocated); + EXPECT_EQ(-1, pool->descriptors[2].owner); + EXPECT_EQ(1, atomic_read(&pool->descriptors[2].refs)); + EXPECT_EQ(1, pool->descriptors[3].owner); + EXPECT_EQ(48, atomic_read(&pool->free_bpages)); +} +TEST_F(homa_pool, homa_pool_alloc_msg__reuse_owned_page) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc1, *crpc2; + + pool->cores[smp_processor_id()].next_candidate = 2; + crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc1); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 100, 1000, 3000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, 
&self->client_ip, + &self->server_ip, 4000, 100, 1000, 3000); ASSERT_NE(NULL, crpc2); - EXPECT_EQ(0, homa_pool_allocate(crpc1)); - EXPECT_EQ(0, homa_pool_allocate(crpc2)); EXPECT_EQ(1, crpc1->msgin.num_bpages); + EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc1->msgin.bpage_offsets[0]); EXPECT_EQ(1, crpc2->msgin.num_bpages); - EXPECT_EQ(2, atomic_read(&pool->descriptors[2].refs)); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); - EXPECT_EQ(5000, pool->cores[cpu_number].allocated); + EXPECT_EQ(2*HOMA_BPAGE_SIZE + 2000, crpc2->msgin.bpage_offsets[0]); + EXPECT_EQ(3, atomic_read(&pool->descriptors[2].refs)); + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); + EXPECT_EQ(5000, pool->cores[smp_processor_id()].allocated); } -TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage) +TEST_F(homa_pool, homa_pool_alloc_msg__cant_allocate_partial_bpage) { - struct homa_pool *pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - pool->num_bpages = 5; - atomic_set(&pool->active_pages, 5); - atomic_set(&pool->next_scan, 2); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 5*HOMA_BPAGE_SIZE + 100); + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + atomic_set(&pool->free_bpages, 5); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, + 5*HOMA_BPAGE_SIZE + 100); ASSERT_NE(NULL, crpc); - EXPECT_EQ(-1, homa_pool_allocate(crpc)); EXPECT_EQ(0, crpc->msgin.num_bpages); EXPECT_EQ(0, atomic_read(&pool->descriptors[0].refs)); EXPECT_EQ(0, atomic_read(&pool->descriptors[1].refs)); EXPECT_EQ(0, atomic_read(&pool->descriptors[4].refs)); + EXPECT_EQ(5, atomic_read(&pool->free_bpages)); } -TEST_F(homa_pool, homa_pool_allocate__not_enough_space_in_owned_page) +TEST_F(homa_pool, homa_pool_alloc_msg__out_of_space) { - struct homa_pool *pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - atomic_set(&pool->next_scan, 2); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); - ASSERT_NE(NULL, crpc); + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *rpc; - EXPECT_EQ(0, homa_pool_allocate(crpc)); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); - crpc->msgin.num_bpages = 0; - pool->cores[cpu_number].allocated = HOMA_BPAGE_SIZE-1900; - EXPECT_EQ(0, homa_pool_allocate(crpc)); - EXPECT_EQ(1, crpc->msgin.num_bpages); - EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(3, pool->cores[cpu_number].page_hint); - EXPECT_EQ(2000, pool->cores[cpu_number].allocated); - EXPECT_EQ(-1, pool->descriptors[2].owner); - EXPECT_EQ(1, pool->descriptors[3].owner); -} -TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) -{ - struct homa_pool *pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); - ASSERT_NE(NULL, crpc); - pool->cores[cpu_number].page_hint = 2; - pool->cores[cpu_number].allocated = HOMA_BPAGE_SIZE-1900; - pool->descriptors[2].owner = cpu_number; + /* Queue up several RPCs to make sure they are properly sorted. 
*/ + atomic_set(&pool->free_bpages, 0); + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 100, 1000, 2*HOMA_BPAGE_SIZE); + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 102, 1000, 2000); - EXPECT_EQ(0, homa_pool_allocate(crpc)); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); - EXPECT_EQ(1, crpc->msgin.num_bpages); - EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(2000, pool->cores[cpu_number].allocated); - EXPECT_EQ(cpu_number, pool->descriptors[2].owner); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.bpage_reuses); + ASSERT_EQ(0, atomic_read(&pool->free_bpages)); + ASSERT_FALSE(list_empty(&self->hsk.waiting_for_bufs)); + rpc = list_first_entry(&self->hsk.waiting_for_bufs, struct homa_rpc, + buf_links); + EXPECT_EQ(98, rpc->id); + ASSERT_FALSE(list_is_last(&rpc->buf_links, &self->hsk.waiting_for_bufs)); + rpc = list_next_entry(rpc, buf_links); + EXPECT_EQ(102, rpc->id); + ASSERT_FALSE(list_is_last(&rpc->buf_links, &self->hsk.waiting_for_bufs)); + rpc = list_next_entry(rpc, buf_links); + EXPECT_EQ(100, rpc->id); + EXPECT_TRUE(list_is_last(&rpc->buf_links, &self->hsk.waiting_for_bufs)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(3, homa_metrics_per_cpu()->buffer_alloc_failures); +#endif /* See strip.py */ + EXPECT_EQ(1, pool->bpages_needed); } -TEST_F(homa_pool, homa_pool_get_buffer__basics) +TEST_F(homa_pool, homa_pool_get_buffer) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; int available; void *buffer; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 100*HOMA_BPAGE_SIZE)); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 150000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 150000); ASSERT_NE(NULL, crpc); buffer = homa_pool_get_buffer(crpc, HOMA_BPAGE_SIZE + 1000, &available); EXPECT_EQ(HOMA_BPAGE_SIZE - 1000, available); @@ -443,43 +504,47 @@ TEST_F(homa_pool, homa_pool_get_buffer__basics) EXPECT_EQ((150000 & (HOMA_BPAGE_SIZE-1)) - 100, available); EXPECT_EQ((void *) (pool->region + 2*HOMA_BPAGE_SIZE + 100), buffer); } -TEST_F(homa_pool, homa_pool_get_buffer__cant_allocate_buffers) +TEST_F(homa_pool, homa_pool_get_buffer__bad_offset) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 150000); + struct homa_rpc *crpc; + int available; + void *buffer; + + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 150000); ASSERT_NE(NULL, crpc); - EXPECT_EQ(-1, homa_pool_allocate(crpc)); - EXPECT_EQ(0, crpc->msgin.num_bpages); + buffer = homa_pool_get_buffer(crpc, 149900, &available); + EXPECT_NE(NULL, buffer); + EXPECT_EQ(100, available); + buffer = homa_pool_get_buffer(crpc, 150000, &available); + EXPECT_EQ(NULL, buffer); + EXPECT_EQ(0, available); } -TEST_F(homa_pool, homa_pool_release_buffers) +TEST_F(homa_pool, homa_pool_release_buffers__basics) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc1, *crpc2; char *saved_region; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, - self->buffer_region, 
100*HOMA_BPAGE_SIZE)); - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 150000); + crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 150000); ASSERT_NE(NULL, crpc1); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc2); - EXPECT_EQ(0, homa_pool_allocate(crpc1)); - EXPECT_EQ(0, homa_pool_allocate(crpc2)); EXPECT_EQ(1, atomic_read(&pool->descriptors[0].refs)); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); - EXPECT_EQ(2, atomic_read(&pool->descriptors[2].refs)); + EXPECT_EQ(3, atomic_read(&pool->descriptors[2].refs)); + EXPECT_EQ(97, atomic_read(&pool->free_bpages)); homa_pool_release_buffers(pool, crpc1->msgin.num_bpages, crpc1->msgin.bpage_offsets); EXPECT_EQ(0, atomic_read(&pool->descriptors[0].refs)); EXPECT_EQ(0, atomic_read(&pool->descriptors[1].refs)); - EXPECT_EQ(1, atomic_read(&pool->descriptors[2].refs)); + EXPECT_EQ(2, atomic_read(&pool->descriptors[2].refs)); + EXPECT_EQ(99, atomic_read(&pool->free_bpages)); /* Ignore requests if pool not initialized. */ saved_region = pool->region; @@ -488,4 +553,231 @@ TEST_F(homa_pool, homa_pool_release_buffers) crpc1->msgin.bpage_offsets); EXPECT_EQ(0, atomic_read(&pool->descriptors[0].refs)); pool->region = saved_region; -} \ No newline at end of file +} +TEST_F(homa_pool, homa_pool_release_buffers__bogus_offset) +{ + u32 buffer = self->hsk.buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; + + EXPECT_EQ(EINVAL, -homa_pool_release_buffers(self->hsk.buffer_pool, + 1, &buffer)); +} + +TEST_F(homa_pool, homa_pool_check_waiting__basics) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc2, *crpc3; + + /* Queue up 2 RPCs that together need a total of 5 bpages. */ + atomic_set(&pool->free_bpages, 0); + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 3*HOMA_BPAGE_SIZE); + ASSERT_NE(NULL, crpc2); + EXPECT_EQ(0, crpc2->msgin.num_bpages); + EXPECT_EQ(3, pool->bpages_needed); + + crpc3 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); + ASSERT_NE(NULL, crpc3); + EXPECT_EQ(0, crpc3->msgin.num_bpages); + EXPECT_EQ(2, pool->bpages_needed); + + /* Now free up the allocated pages and make sure that space can be + * allocated for the queued RPCs. + */ + unit_log_clear(); + atomic_set(&pool->free_bpages, 1); + homa_pool_check_waiting(pool); + EXPECT_EQ(0, crpc2->msgin.num_bpages); + EXPECT_EQ(0, crpc3->msgin.num_bpages); + atomic_set(&pool->free_bpages, 5); + homa_pool_check_waiting(pool); + EXPECT_EQ(3, crpc2->msgin.num_bpages); + EXPECT_EQ(2, crpc3->msgin.num_bpages); + EXPECT_EQ(INT_MAX, pool->bpages_needed); +} +TEST_F(homa_pool, homa_pool_check_waiting__pool_not_initialized) +{ + struct homa_pool pool; + + memset(&pool, 0, sizeof(pool)); + + /* Without the initialization check, this will crash. 
*/ + homa_pool_check_waiting(&pool); +} +TEST_F(homa_pool, homa_pool_check_waiting__bpages_needed_but_no_queued_rpcs) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + + pool->bpages_needed = 1; + homa_pool_check_waiting(pool); + EXPECT_EQ(100, atomic_read(&pool->free_bpages)); + EXPECT_EQ(INT_MAX, pool->bpages_needed); +} +TEST_F(homa_pool, homa_pool_check_waiting__rpc_initially_locked) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + atomic_set(&pool->free_bpages, 0); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(0, crpc->msgin.num_bpages); + +#ifndef __STRIP__ /* See strip.py */ + mock_trylock_errors = 0xa; +#else /* See strip.py */ + mock_trylock_errors = 0x3; +#endif /* See strip.py */ + unit_log_clear(); + atomic_set(&pool->free_bpages, 1); + homa_pool_check_waiting(pool); + EXPECT_SUBSTR("rpc lock unavailable in homa_pool_check_waiting; " + "rpc lock unavailable in homa_pool_check_waiting", + unit_log_get()); + EXPECT_EQ(1, crpc->msgin.num_bpages); + EXPECT_TRUE(list_empty(&self->hsk.waiting_for_bufs)); +} +TEST_F(homa_pool, homa_pool_check_waiting__reset_bpages_needed) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc1, *crpc2; + + atomic_set(&pool->free_bpages, 0); + crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); + ASSERT_NE(NULL, crpc1); + EXPECT_EQ(0, crpc1->msgin.num_bpages); + + atomic_set(&pool->free_bpages, 0); + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE - 1); + ASSERT_NE(NULL, crpc2); + EXPECT_EQ(0, crpc2->msgin.num_bpages); + EXPECT_EQ(1, pool->bpages_needed); + + atomic_set(&pool->free_bpages, 1); + homa_pool_check_waiting(pool); + EXPECT_EQ(1, crpc1->msgin.num_bpages); + EXPECT_EQ(0, crpc2->msgin.num_bpages); + EXPECT_EQ(2, pool->bpages_needed); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + /* Queue up an RPC that needs 2 bpages. */ + atomic_set(&pool->free_bpages, 0); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(0, crpc->msgin.num_bpages); + EXPECT_EQ(2, pool->bpages_needed); + EXPECT_EQ(-1, crpc->msgin.rank); + + /* Free the required pages. */ + unit_log_clear(); + atomic_set(&pool->free_bpages, 2); + homa_pool_check_waiting(pool); + EXPECT_EQ(2, crpc->msgin.num_bpages); + EXPECT_STREQ("xmit RESEND 0--2@6", unit_log_get()); + EXPECT_EQ(0, crpc->msgin.rank); +} +TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc_only_one_priority_level) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + /* Queue up an RPC that needs 2 bpages. */ + atomic_set(&pool->free_bpages, 0); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(0, crpc->msgin.num_bpages); + EXPECT_EQ(2, pool->bpages_needed); + self->homa.num_priorities = 1; + + /* Free the required pages. 
 */
+	unit_log_clear();
+	atomic_set(&pool->free_bpages, 2);
+	homa_pool_check_waiting(pool);
+	EXPECT_EQ(2, crpc->msgin.num_bpages);
+	EXPECT_EQ(0, crpc->msgin.rank);
+	EXPECT_STREQ("xmit RESEND 0--2@0", unit_log_get());
+}
+TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc_no_need_for_grants)
+{
+	struct homa_pool *pool = self->hsk.buffer_pool;
+	struct homa_rpc *crpc;
+
+	atomic_set(&pool->free_bpages, 0);
+	crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip,
+			       &self->server_ip, 4000, 98, 1000, 5000);
+	ASSERT_NE(NULL, crpc);
+	EXPECT_EQ(0, crpc->msgin.num_bpages);
+	EXPECT_EQ(1, pool->bpages_needed);
+	EXPECT_EQ(-1, crpc->msgin.rank);
+
+	/* Free the required pages. */
+	unit_log_clear();
+	atomic_set(&pool->free_bpages, 2);
+	homa_pool_check_waiting(pool);
+	EXPECT_EQ(1, crpc->msgin.num_bpages);
+	EXPECT_STREQ("xmit RESEND 0--2@6", unit_log_get());
+	EXPECT_EQ(-1, crpc->msgin.rank);
+}
+#endif /* See strip.py */
+TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails)
+{
+	struct homa_pool *pool = self->hsk.buffer_pool;
+	struct homa_rpc *crpc;
+
+	/* Queue up an RPC that needs 4 bpages. */
+	atomic_set(&pool->free_bpages, 0);
+	crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip,
+			       &self->server_ip, 4000, 98, 1000, 4*HOMA_BPAGE_SIZE);
+	ASSERT_NE(NULL, crpc);
+	EXPECT_EQ(0, crpc->msgin.num_bpages);
+	pool->bpages_needed = 2;
+
+	unit_log_clear();
+	atomic_set(&pool->free_bpages, 2);
+	homa_pool_check_waiting(pool);
+	EXPECT_EQ(0, crpc->msgin.num_bpages);
+	EXPECT_STREQ("", unit_log_get());
+	EXPECT_EQ(4, pool->bpages_needed);
+}
+
+TEST_F(homa_pool, homa_pool_avail_bytes__no_region)
+{
+	struct homa_pool *pool = homa_pool_alloc(&self->hsk);
+
+	EXPECT_EQ(0, homa_pool_avail_bytes(pool));
+	homa_pool_free(pool);
+}
+TEST_F(homa_pool, homa_pool_avail_bytes__a_few_pages_allocated)
+{
+	struct homa_pool *pool = self->hsk.buffer_pool;
+	u32 pages[10];
+
+	EXPECT_EQ(100 * HOMA_BPAGE_SIZE, homa_pool_avail_bytes(pool));
+	EXPECT_EQ(0, homa_pool_get_pages(pool, 5, pages, 0));
+	EXPECT_EQ(95 * HOMA_BPAGE_SIZE, homa_pool_avail_bytes(pool));
+}
+TEST_F(homa_pool, homa_pool_avail_bytes__include_free_space_in_core_private_pages)
+{
+	struct homa_pool *pool = self->hsk.buffer_pool;
+
+	mock_set_core(3);
+	EXPECT_EQ(100 * HOMA_BPAGE_SIZE, homa_pool_avail_bytes(pool));
+	unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip,
+			&self->server_ip, 4000, 98, 1000, 2000);
+	EXPECT_EQ(100 * HOMA_BPAGE_SIZE - 2000, homa_pool_avail_bytes(pool));
+
+	mock_set_core(5);
+	unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip,
+			&self->server_ip, 4000, 98, 1000, 50000);
+	EXPECT_EQ(100 * HOMA_BPAGE_SIZE - 52000, homa_pool_avail_bytes(pool));
+}
diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c
new file mode 100644
index 00000000..5126f0d2
--- /dev/null
+++ b/test/unit_homa_qdisc.c
@@ -0,0 +1,2489 @@
+// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+#include "homa_impl.h"
+#include "homa_pacer.h"
+#include "homa_qdisc.h"
+#include "homa_rpc.h"
+#define KSELFTEST_NOT_MAIN 1
+#include "kselftest_harness.h"
+#include "ccutils.h"
+#include "mock.h"
+#include "utils.h"
+
+#include <linux/tcp.h>
+
+/**
+ * new_test_skb() - Create a new skb for use in qdisc tests.
+ * The skb will have a small data area plus homa_skb_info.
+ * @rpc: RPC that the packet belongs to (stored in the homa_skb_info
+ * for the packet).
+ * @saddr: Source address for packet.
+ * @offset: Offset of packet data within output message.
+ * @length: Number of bytes of message data in packet;
+ *          qdisc_skb_cb(skb)->pkt_len is set to this value plus 100.
+ */
+static struct sk_buff *new_test_skb(struct homa_rpc *rpc,
+				    struct in6_addr *saddr, int offset,
+				    int length)
+{
+	struct homa_skb_info *info;
+	struct homa_data_hdr data;
+	struct sk_buff *skb;
+
+	data.common = (struct homa_common_hdr){
+		.sport = htons(rpc->hsk->port),
+		.dport = htons(rpc->dport),
+		.type = DATA,
+		.sender_id = cpu_to_be64(rpc->id)
+	};
+	data.message_length = htonl(rpc->msgout.length);
+	data.seg.offset = htonl(offset);
+	skb = mock_skb_alloc(saddr, &data.common,
+			     length + sizeof(struct homa_skb_info), 0);
+	info = homa_get_skb_info(skb);
+	info->rpc = rpc;
+	info->data_bytes = length;
+	info->offset = offset;
+	qdisc_skb_cb(skb)->pkt_len = length + 100;
+	return skb;
+}
+
+/**
+ * init_qdisc() - Make a homa_qdisc out of a Qdisc.
+ * @qdisc: Qdisc to initialize for use as a homa_qdisc.
+ * Return: The homa_qdisc private data.
+ */
+static struct homa_qdisc *init_qdisc(struct Qdisc *qdisc)
+{
+	EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL));
+	return qdisc_priv(qdisc);
+}
+
+static void log_deferred(struct homa_qdisc_dev *qdev)
+{
+	struct homa_skb_info *info;
+	struct rb_node *node;
+	struct homa_rpc *rpc;
+	struct sk_buff *skb;
+
+	for (node = rb_first_cached(&qdev->deferred_rpcs); node;
+	     node = rb_next(node)) {
+		rpc = container_of(node, struct homa_rpc, qrpc.rb_node);
+		unit_log_printf("; ", "[id %llu, offsets", rpc->id);
+		skb_queue_walk(&rpc->qrpc.packets, skb) {
+			info = homa_get_skb_info(skb);
+			unit_log_printf(" ", "%d", info->offset);
+		}
+		unit_log_printf("", "]");
+	}
+}
+
+static void log_tcp_deferred(struct homa_qdisc *q)
+{
+	struct sk_buff *skb;
+
+	skb_queue_walk(&q->deferred_tcp, skb) {
+		unit_log_printf(" ", "%d", ntohl(tcp_hdr(skb)->seq));
+	}
+}
+
+static struct homa_qdisc_dev *exit_hook_qdev;
+static int exit_hook_count;
+static void exit_hook(char *id)
+{
+	if (strcmp(id, "prepare_to_wait") != 0)
+		return;
+	if (exit_hook_count > 0) {
+		exit_hook_count--;
+		if (exit_hook_count == 0)
+			mock_exit_thread = true;
+	}
+}
+
+static struct homa_qdisc_dev *defer_hook_qdev;
+static struct sk_buff *defer_hook_skb;
+static void defer_hook(char *id)
+{
+	if (strcmp(id, "prepare_to_wait") == 0 && defer_hook_qdev) {
+		homa_qdisc_defer_homa(defer_hook_qdev, defer_hook_skb);
+		defer_hook_qdev = NULL;
+	}
+}
+
+static int create_hook_count;
+static struct net_device *hook_dev;
+static void qdev_create_hook(char *id)
+{
+	if (strcmp(id, "mutex_lock") != 0)
+		return;
+	if (create_hook_count <= 0)
+		return;
+	create_hook_count--;
+	if (create_hook_count == 0)
+		homa_qdisc_qdev_get(hook_dev);
+}
+
+static u64 xmit_clock;
+static void xmit_hook(char *id)
+{
+	if (strcmp(id, "pacer_xmit") != 0)
+		return;
+	if (xmit_clock == 0)
+		xmit_clock = mock_clock;
+}
+
+static struct Qdisc *hook_qdisc;
+static void complete_qdisc_hook(char *id)
+{
+	if (strcmp(id, "pacer spin") != 0)
+		return;
+	hook_qdisc->dev_queue->dql.num_completed += 1;
+}
+
+FIXTURE(homa_qdisc) {
+	struct homa homa;
+	struct homa_net *hnet;
+	struct in6_addr addr;
+	struct in6_addr addr2;
+	struct net_device *dev;
+#define NUM_TXQS 4
+	struct netdev_queue txqs[NUM_TXQS];
+	struct Qdisc *qdiscs[NUM_TXQS];
+	struct ethtool_ops ethtool_ops;
+	struct in6_addr client_ip;
+	struct in6_addr server_ip;
+	int client_port;
+	int server_port;
+	u64 client_id;
+	u64 server_id;
+	struct homa_sock hsk;
+	struct homa_data_hdr data;
+};
+FIXTURE_SETUP(homa_qdisc)
+{
+	int i;
+
+	homa_qdisc_register();
+	homa_init(&self->homa);
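+	/* The fixture wires up a mock net_device with NUM_TXQS transmit
+	 * queues, each with its own Qdisc, plus a Homa socket that
+	 * tests use to create RPCs.
+	 */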
+	self->hnet = mock_hnet(0, &self->homa);
+	self->addr = unit_get_in_addr("1.2.3.4");
+	self->addr2 = unit_get_in_addr("1.2.3.5");
+	self->dev = mock_dev(0, &self->homa);
+	self->dev->_tx = self->txqs;
+	self->dev->num_tx_queues = NUM_TXQS;
+	self->dev->nd_net.net = mock_net_for_hnet(self->hnet);
+	self->dev->ethtool_ops = &self->ethtool_ops;
+	memset(&self->ethtool_ops, 0, sizeof(self->ethtool_ops));
+	self->ethtool_ops.get_link_ksettings = mock_get_link_ksettings;
+
+	memset(&self->txqs, 0, sizeof(self->txqs));
+	memset(&self->qdiscs, 0, sizeof(self->qdiscs));
+	for (i = 0; i < NUM_TXQS; i++) {
+		self->txqs[i].state = 0;
+		self->txqs[i].dev = self->dev;
+		self->qdiscs[i] = mock_alloc_qdisc(&self->txqs[i]);
+		self->txqs[i].qdisc = self->qdiscs[i];
+	}
+	mock_net_queue.dev = self->dev;
+
+	self->client_ip = unit_get_in_addr("196.168.0.1");
+	self->server_ip = unit_get_in_addr("1.2.3.4");
+	self->client_port = 40000;
+	self->server_port = 99;
+	self->client_id = 1234;
+	self->server_id = 1235;
+	mock_sock_init(&self->hsk, self->hnet, self->client_port);
+
+	self->data.common = (struct homa_common_hdr){
+		.sport = htons(1000),
+		.dport = htons(2000),
+		.type = DATA,
+		.sender_id = cpu_to_be64(100)
+	};
+	self->data.message_length = htonl(10000);
+
+	mock_clock = 10000;
+	unit_log_clear();
+}
+FIXTURE_TEARDOWN(homa_qdisc)
+{
+	int i;
+
+	for (i = 0; i < NUM_TXQS; i++) {
+		struct homa_qdisc *q = qdisc_priv(self->qdiscs[i]);
+		if (q->qdev)
+			homa_qdisc_destroy(self->qdiscs[i]);
+		kfree(self->qdiscs[i]);
+	}
+	homa_destroy(&self->homa);
+	homa_qdisc_unregister();
+	unit_teardown();
+}
+
+TEST_F(homa_qdisc, homa_rcu_kfree__kmalloc_succeeds)
+{
+	/* Nothing to check in this test; if it fails, test infrastructure
+	 * will detect memory alloc-free mismatches.
+	 */
+
+	homa_rcu_kfree(kmalloc(100, GFP_KERNEL));
+}
+TEST_F(homa_qdisc, homa_rcu_kfree__kmalloc_fails)
+{
+	mock_kmalloc_errors = 2;
+	homa_rcu_kfree(kmalloc(100, GFP_KERNEL));
+	EXPECT_STREQ("homa_rcu_kfree kmalloc failed", unit_log_get());
+}
+
+TEST_F(homa_qdisc, homa_rcu_kfree_callback)
+{
+	struct homa_rcu_kfreer *freer;
+
+	/* Any errors in freeing will be detected by test infrastructure. */
+	freer = kmalloc(sizeof(*freer), GFP_KERNEL);
+	freer->object = kmalloc(200, GFP_KERNEL);
+	homa_rcu_kfree_callback(&freer->rcu_head);
+}
+
+TEST_F(homa_qdisc, homa_qdisc_shared_alloc__success)
+{
+	struct homa_qdisc_shared *qshared;
+
+	qshared = homa_qdisc_shared_alloc();
+	ASSERT_FALSE(IS_ERR(qshared));
+	EXPECT_EQ(0, unit_list_length(&qshared->qdevs));
+	kfree(qshared);
+}
+TEST_F(homa_qdisc, homa_qdisc_shared_alloc__kmalloc_failure)
+{
+	struct homa_qdisc_shared *qshared;
+
+	mock_kmalloc_errors = 1;
+	qshared = homa_qdisc_shared_alloc();
+	ASSERT_TRUE(IS_ERR(qshared));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(qshared));
+}
+TEST_F(homa_qdisc, homa_qdisc_shared_alloc__cant_register_sysctls)
+{
+	struct homa_qdisc_shared *qshared;
+
+	mock_register_sysctl_errors = 1;
+	qshared = homa_qdisc_shared_alloc();
+	ASSERT_TRUE(IS_ERR(qshared));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(qshared));
+}
+
+TEST_F(homa_qdisc, homa_qdisc_shared_free__basics)
+{
+	struct homa_qdisc_shared *qshared;
+
+	/* Test infrastructure will report any inconsistencies in
+	 * memory allocation.
+	 */
+	qshared = homa_qdisc_shared_alloc();
+	homa_qdisc_shared_free(qshared);
+	EXPECT_STREQ("unregister_net_sysctl_table; call_rcu invoked",
+		     unit_log_get());
+}
+TEST_F(homa_qdisc, homa_qdisc_shared_free__unfreed_qdevs)
+{
+	struct homa_qdisc_shared *qshared, *saved_qshared;
+	struct homa_qdisc_dev *qdev;
+
+	qshared = homa_qdisc_shared_alloc();
+	saved_qshared = self->homa.qshared;
+	self->homa.qshared = qshared;
+	qdev = homa_qdisc_qdev_get(self->dev);
+	EXPECT_EQ(1, unit_list_length(&qshared->qdevs));
+	self->homa.qshared = saved_qshared;
+	mock_printk_output[0] = 0;
+	homa_qdisc_shared_free(qshared);
+	EXPECT_STREQ("homa_qdisc_devs_free found 1 live qdevs "
+		     "(should have been none)", mock_printk_output);
+	homa_qdisc_qdev_put(qdev);
+}
+
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__basics)
+{
+	struct homa_qdisc_dev *qdev;
+
+	qdev = homa_qdisc_qdev_get(self->dev);
+	EXPECT_FALSE(IS_ERR(qdev));
+	EXPECT_EQ(1, refcount_read(&qdev->refs));
+	EXPECT_EQ(1, unit_list_length(&self->homa.qshared->qdevs));
+
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__use_existing)
+{
+	struct homa_qdisc_dev *qdev, *qdev2;
+
+	/* Arrange for the desired qdev not to be first on the qdev
+	 * list, to exercise list traversal.
+	 */
+	qdev = homa_qdisc_qdev_get(self->dev);
+	qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa));
+
+	EXPECT_FALSE(IS_ERR(qdev));
+	EXPECT_EQ(2, unit_list_length(&self->homa.qshared->qdevs));
+	EXPECT_EQ(1, refcount_read(&qdev->refs));
+
+	EXPECT_EQ(qdev, homa_qdisc_qdev_get(self->dev));
+	EXPECT_EQ(2, refcount_read(&qdev->refs));
+
+	homa_qdisc_qdev_put(qdev2);
+	homa_qdisc_qdev_put(qdev);
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__race_when_creating)
+{
+	struct homa_qdisc_dev *qdev;
+
+	unit_hook_register(qdev_create_hook);
+	hook_dev = self->dev;
+	create_hook_count = 1;
+	unit_log_clear();
+	qdev = homa_qdisc_qdev_get(self->dev);
+	EXPECT_FALSE(IS_ERR(qdev));
+	EXPECT_EQ(1, unit_list_length(&self->homa.qshared->qdevs));
+	EXPECT_EQ(2, refcount_read(&qdev->refs));
+	EXPECT_SUBSTR("race in homa_qdisc_qdev_get", unit_log_get());
+
+	homa_qdisc_qdev_put(qdev);
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__kmalloc_failure)
+{
+	struct homa_qdisc_dev *qdev;
+
+	mock_kmalloc_errors = 1;
+	qdev = homa_qdisc_qdev_get(self->dev);
+	EXPECT_TRUE(IS_ERR(qdev));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(qdev));
+}
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__cant_create_thread)
+{
+	struct homa_qdisc_dev *qdev;
+
+	mock_kthread_create_errors = 1;
+	qdev = homa_qdisc_qdev_get(self->dev);
+	EXPECT_TRUE(IS_ERR(qdev));
+	EXPECT_EQ(EACCES, -PTR_ERR(qdev));
+}
+
+TEST_F(homa_qdisc, homa_qdisc_qdev_put)
+{
+	struct homa_qdisc_dev *qdev1, *qdev2, *qdev3;
+
+	qdev1 = homa_qdisc_qdev_get(self->dev);
+	EXPECT_FALSE(IS_ERR(qdev1));
+	qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa));
+	EXPECT_FALSE(IS_ERR(qdev2));
+	qdev3 = homa_qdisc_qdev_get(mock_dev(2, &self->homa));
+	EXPECT_FALSE(IS_ERR(qdev3));
+
+	EXPECT_EQ(qdev2, homa_qdisc_qdev_get(mock_dev(1, &self->homa)));
+	EXPECT_EQ(2, refcount_read(&qdev2->refs));
+
+	/* First call: refcount doesn't hit zero. */
+	homa_qdisc_qdev_put(qdev2);
+	EXPECT_EQ(1, refcount_read(&qdev2->refs));
+	EXPECT_EQ(3, unit_list_length(&self->homa.qshared->qdevs));
+
+	/* Second call: refcount hits zero.
*/ + homa_qdisc_qdev_put(qdev2); + EXPECT_EQ(2, unit_list_length(&self->homa.qshared->qdevs)); + + homa_qdisc_qdev_put(qdev3); + homa_qdisc_qdev_put(qdev1); + EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); +} + +TEST_F(homa_qdisc, homa_qdisc_dev_callback) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 1000, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 2000, 1500)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]; [id 1235, offsets 1000]", + unit_log_get()); + + /* If skbs aren't freed, test infrastructure will complain. */ + homa_qdisc_qdev_put(qdev); + EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); +} + +TEST_F(homa_qdisc, homa_qdisc_init__basics) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc_dev *qdev; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + qdev = list_first_or_null_rcu(&self->homa.qshared->qdevs, + struct homa_qdisc_dev, links); + ASSERT_NE(NULL, qdev); + EXPECT_EQ(1, refcount_read(&qdev->refs)); + EXPECT_EQ(10000, qdev->link_mbps); + EXPECT_EQ(10240, qdisc->limit); + q = qdisc_priv(qdisc); + EXPECT_EQ(-1, q->ix); + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_init__cant_create_new_qdisc_dev) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + + mock_kmalloc_errors = 1; + EXPECT_EQ(ENOMEM, -homa_qdisc_init(qdisc, NULL, NULL)); + EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); + kfree(qdisc); +} + +TEST_F(homa_qdisc, homa_qdisc_destroy) +{ + struct Qdisc *qdisc, *qdisc2; + struct homa_qdisc_dev *qdev; + struct homa_qdisc *q, *q2; + + qdisc = mock_alloc_qdisc(&mock_net_queue); + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + q->ix = 3; + qdisc2 = mock_alloc_qdisc(&mock_net_queue); + EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL)); + q2 = qdisc_priv(qdisc2); + q2->ix = 4; + qdev = list_first_or_null_rcu(&self->homa.qshared->qdevs, + struct homa_qdisc_dev, links); + EXPECT_NE(NULL, qdev); + EXPECT_EQ(2, refcount_read(&qdev->refs)); + + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); + homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 6000, 1100)); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1100)); + EXPECT_EQ(2, unit_list_length(&qdev->deferred_qdiscs)); + qdev->congested_qdisc = q2; + + homa_qdisc_destroy(qdisc); + EXPECT_EQ(1, refcount_read(&qdev->refs)); + EXPECT_EQ(1, unit_list_length(&qdev->deferred_qdiscs)); + EXPECT_EQ(q2, qdev->congested_qdisc); + + homa_qdisc_destroy(qdisc2); + EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); + EXPECT_EQ(NULL, qdev->congested_qdisc); + kfree(qdisc); + kfree(qdisc2); +} + +TEST_F(homa_qdisc, homa_qdisc_enqueue__mark_congested_queue) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 200); + ASSERT_NE(NULL, srpc); + + 
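+	/* Setting dql.num_queued above max_nic_queue_bytes makes the
+	 * NIC queue appear congested, so the enqueue below should
+	 * record this qdisc in qdev->congested_qdisc.
+	 */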
q->qdisc->dev_queue->dql.num_queued = q->qdev->max_nic_queue_bytes + 1;
+ skb = new_test_skb(srpc, &self->addr, 0, 200);
+ to_free = NULL;
+ unit_log_clear();
+
+ EXPECT_EQ(NULL, q->qdev->congested_qdisc);
+ EXPECT_EQ(NET_XMIT_SUCCESS,
+ homa_qdisc_enqueue(skb, q->qdisc, &to_free));
+ EXPECT_EQ(q, q->qdev->congested_qdisc);
+}
+TEST_F(homa_qdisc, homa_qdisc_enqueue__short_tcp_packet)
+{
+ struct homa_qdisc *q = init_qdisc(self->qdiscs[3]);
+ struct sk_buff *skb, *to_free;
+
+ atomic64_set(&q->qdev->link_idle_time, 1000000);
+ mock_queue_index = 3;
+
+ /* First packet is short but gets transmitted in spite of
+ * link_idle_time.
+ */
+ skb = mock_tcp_skb(&self->addr, 5000, 500);
+ to_free = NULL;
+ homa_qdisc_enqueue(skb, q->qdisc, &to_free);
+ EXPECT_EQ(NULL, to_free);
+ EXPECT_EQ(0, skb_queue_len(&q->deferred_tcp));
+ EXPECT_EQ(1, q->qdisc->q.qlen);
+
+ /* Second packet is long, so it gets deferred because of
+ * link_idle_time.
+ */
+ homa_qdisc_enqueue(mock_tcp_skb(&self->addr, 5000, 2000), q->qdisc,
+ &to_free);
+ EXPECT_EQ(NULL, to_free);
+ EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp));
+
+ /* Third packet is short and gets transmitted: the previously
+ * deferred packet is for a different flow.
+ */
+ skb = mock_tcp_skb(&self->addr, 7000, 500);
+ tcp_hdr(skb)->source = 13;
+ to_free = NULL;
+ homa_qdisc_enqueue(skb, q->qdisc, &to_free);
+ EXPECT_EQ(NULL, to_free);
+ EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp));
+ EXPECT_EQ(2, q->qdisc->q.qlen);
+
+ /* Fourth packet gets deferred: same flow as previously deferred
+ * packet.
+ */
+ skb = mock_tcp_skb(&self->addr, 8000, 500);
+ to_free = NULL;
+ homa_qdisc_enqueue(skb, q->qdisc, &to_free);
+ EXPECT_EQ(NULL, to_free);
+ EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp));
+ EXPECT_EQ(2, q->qdisc->q.qlen);
+}
+TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_congested_qdisc)
+{
+ struct homa_qdisc *q = init_qdisc(self->qdiscs[3]);
+ struct sk_buff *skb, *to_free;
+
+ mock_queue_index = 3;
+
+ skb = mock_tcp_skb(&self->addr, 6000, 1500);
+ to_free = NULL;
+ q->qdev->congested_qdisc = q;
+ homa_qdisc_enqueue(skb, q->qdisc, &to_free);
+ EXPECT_EQ(NULL, to_free);
+ EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp));
+}
+TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_other_deferred_packets)
+{
+ struct homa_qdisc *q = init_qdisc(self->qdiscs[3]);
+ struct sk_buff *skb, *to_free;
+
+ mock_queue_index = 3;
+
+ /* First packet gets deferred because of congested qdisc. */
+ skb = mock_tcp_skb(&self->addr, 6000, 1500);
+ to_free = NULL;
+ q->qdev->congested_qdisc = q;
+ homa_qdisc_enqueue(skb, q->qdisc, &to_free);
+ EXPECT_EQ(NULL, to_free);
+ EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp));
+
+ /* Second packet gets deferred because the first packet was deferred.
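+ * Keeping it behind the earlier packet avoids reordering within the
+ * same TCP flow.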
*/ + skb = mock_tcp_skb(&self->addr, 6000, 1500); + to_free = NULL; + q->qdev->congested_qdisc = NULL; + homa_qdisc_enqueue(skb, q->qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp)); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_link_idle_time) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *to_free; + + mock_queue_index = 3; + atomic64_set(&q->qdev->link_idle_time, 1000000); + + skb = mock_tcp_skb(&self->addr, 6000, 1500); + to_free = NULL; + homa_qdisc_enqueue(skb, q->qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); + EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__short_homa_message) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 200); + ASSERT_NE(NULL, srpc); + + atomic64_set(&q->qdev->link_idle_time, 1000000); + skb = new_test_skb(srpc, &self->addr, 0, 200); + to_free = NULL; + unit_log_clear(); + + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev)); + EXPECT_EQ(1, q->qdisc->q.qlen); + EXPECT_STREQ("", unit_log_get()); + EXPECT_LT(1000000, atomic64_read(&q->qdev->link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__short_final_packet_in_long_message) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + atomic64_set(&q->qdev->link_idle_time, 1000000); + self->data.message_length = htonl(3000); + self->data.seg.offset = htonl(2800); + skb = new_test_skb(srpc, &self->addr, 7000, 100); + to_free = NULL; + unit_log_clear(); + + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); + EXPECT_EQ(0, q->qdisc->q.qlen); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_congested_qdisc) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + skb = new_test_skb(srpc, &self->addr, 0, 1500); + to_free = NULL; + unit_log_clear(); + mock_log_wakeups = 1; + q->qdev->congested_qdisc = q; + + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, q->qdisc, + &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); + log_deferred(q->qdev); + EXPECT_STREQ("wake_up; [id 1235, offsets 0]", unit_log_get()); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_other_packets_deferred) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + /* First packet is deferred because of congested qdisc. 
*/ + skb = new_test_skb(srpc, &self->addr, 0, 1500); + to_free = NULL; + unit_log_clear(); + mock_log_wakeups = 1; + q->qdev->congested_qdisc = q; + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); + + /* Second packet is deferred because first packet was deferred. */ + skb = new_test_skb(srpc, &self->addr, 1500, 1500); + to_free = NULL; + unit_log_clear(); + q->qdev->congested_qdisc = NULL; + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + log_deferred(q->qdev); + EXPECT_STREQ("[id 1235, offsets 0 1500]", unit_log_get()); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_nic_idle_time) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + u64 idle; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + idle = mock_clock + 1 + self->homa.qshared->max_nic_est_backlog_cycles + 1; + atomic64_set(&q->qdev->link_idle_time, idle); + skb = new_test_skb(srpc, &self->addr, 0, 1500); + to_free = NULL; + unit_log_clear(); + mock_log_wakeups = 1; + + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); + EXPECT_STREQ("wake_up", unit_log_get()); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + skb = new_test_skb(srpc, &self->addr, 0, 1500); + q->qdisc->limit = 1; + q->qdisc->q.qlen = 5; + to_free = NULL; + unit_log_clear(); + + EXPECT_EQ(NET_XMIT_DROP, homa_qdisc_enqueue(skb, q->qdisc, &to_free)); + ASSERT_NE(NULL, to_free); + EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev)); + EXPECT_EQ(5, q->qdisc->q.qlen); + + kfree_skb(to_free); +} + +TEST_F(homa_qdisc, homa_qdisc_can_bypass__skb_not_tcp) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb; + + /* First packet: IPv4 but not TCP. */ + mock_ipv6 = false; + skb = mock_tcp_skb(&self->addr, 4000, 100); + ip_hdr(skb)->protocol = IPPROTO_TCP + 1; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); + + /* Second packet: IPv6 but not TCP. */ + mock_ipv6 = true; + skb = mock_tcp_skb(&self->addr, 4000, 100); + ipv6_hdr(skb)->nexthdr = IPPROTO_TCP + 1; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); + + /* Third packet: not IPv4 or IPv6. */ + skb = mock_tcp_skb(&self->addr, 4000, 100); + skb->protocol = 1; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); + + /* Fourth packet: TCP so reordering is allowed (no packets to + * conflict with). + */ + skb = mock_tcp_skb(&self->addr, 4000, 100); + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); +} +TEST_F(homa_qdisc, homa_qdisc_can_bypass__ack) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb; + + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 4000, 100)); + + /* First packet conflicts (not an ack). */ + skb = mock_tcp_skb(&self->addr, 5000, 200); + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); + + /* Second packet is an ack. 
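+ * With no payload it cannot reorder data, so it is allowed to
+ * bypass the deferred packet from the same flow.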
*/
+ skb = mock_tcp_skb(&self->addr, 5000, 0);
+ EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q));
+ kfree_skb(skb);
+}
+TEST_F(homa_qdisc, homa_qdisc_can_bypass__skb2_not_tcp)
+{
+ struct homa_qdisc *q = init_qdisc(self->qdiscs[3]);
+ struct sk_buff *skb, *skb2;
+
+ skb = mock_tcp_skb(&self->addr, 4000, 100);
+
+ /* First attempt: IPv4 but not TCP. */
+ mock_ipv6 = false;
+ skb2 = mock_tcp_skb(&self->addr, 5000, 200);
+ ip_hdr(skb2)->protocol = IPPROTO_TCP + 1;
+ homa_qdisc_defer_tcp(q, skb2);
+ EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q));
+ __skb_queue_purge(&q->deferred_tcp);
+
+ /* Second attempt: IPv6 but not TCP. */
+ mock_ipv6 = true;
+ skb2 = mock_tcp_skb(&self->addr, 5000, 200);
+ ipv6_hdr(skb2)->nexthdr = IPPROTO_TCP + 1;
+ homa_qdisc_defer_tcp(q, skb2);
+ EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q));
+ __skb_queue_purge(&q->deferred_tcp);
+
+ /* Third attempt: not IPv4 or IPv6. */
+ skb2 = mock_tcp_skb(&self->addr, 4000, 100);
+ skb2->protocol = 1;
+ homa_qdisc_defer_tcp(q, skb2);
+ EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q));
+
+ kfree_skb(skb);
+}
+TEST_F(homa_qdisc, homa_qdisc_can_bypass__test_address_and_ports)
+{
+ struct homa_qdisc *q = init_qdisc(self->qdiscs[3]);
+ struct sk_buff *skb;
+
+ mock_ipv6 = true;
+ skb = mock_tcp_skb(&self->addr, 4000, 100);
+ ipv6_hdr(skb)->daddr = self->addr2;
+ tcp_hdr(skb)->source = 13;
+ tcp_hdr(skb)->dest = 42;
+ homa_qdisc_defer_tcp(q, skb);
+
+ /* First packet differs on daddr. */
+ skb = mock_tcp_skb(&self->addr, 5000, 200);
+ ipv6_hdr(skb)->daddr = self->addr;
+ tcp_hdr(skb)->source = 13;
+ tcp_hdr(skb)->dest = 42;
+ EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q));
+
+ /* Second packet differs on source port. */
+ ipv6_hdr(skb)->daddr = self->addr2;
+ tcp_hdr(skb)->source = 12;
+ EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q));
+
+ /* Third packet differs on dest port. */
+ tcp_hdr(skb)->source = 13;
+ tcp_hdr(skb)->dest = 43;
+ EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q));
+
+ /* Fourth packet conflicts. */
+ tcp_hdr(skb)->dest = 42;
+ EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q));
+
+ kfree_skb(skb);
+}
+TEST_F(homa_qdisc, homa_qdisc_can_bypass__multiple_packets_in_list)
+{
+ struct homa_qdisc *q = init_qdisc(self->qdiscs[3]);
+ struct sk_buff *skb;
+
+ skb = mock_tcp_skb(&self->addr, 4000, 100);
+ tcp_hdr(skb)->source = 13;
+ homa_qdisc_defer_tcp(q, skb);
+ skb = mock_tcp_skb(&self->addr, 4000, 100);
+ tcp_hdr(skb)->source = 14;
+ homa_qdisc_defer_tcp(q, skb);
+ skb = mock_tcp_skb(&self->addr, 4000, 100);
+ tcp_hdr(skb)->source = 15;
+ homa_qdisc_defer_tcp(q, skb);
+
+ /* First packet conflicts. */
+ skb = mock_tcp_skb(&self->addr, 5000, 200);
+ tcp_hdr(skb)->source = 14;
+ EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q));
+
+ /* Second packet conflicts. */
+ tcp_hdr(skb)->source = 15;
+ EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q));
+
+ /* Third packet doesn't conflict.
*/ + tcp_hdr(skb)->source = 16; + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + + kfree_skb(skb); +} + +TEST_F(homa_qdisc, homa_qdisc_defer_tcp__basics) +{ + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 10000); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; + + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1500)); + EXPECT_EQ(1, unit_list_length(&q->qdev->deferred_qdiscs)); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); + + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1500)); + EXPECT_EQ(1, unit_list_length(&q->qdev->deferred_qdiscs)); + EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp)); +} +TEST_F(homa_qdisc, homa_qdisc_defer_tcp__update_metrics_and_wakeup) +{ + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 10000); + ASSERT_NE(NULL, srpc); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + q->ix = 7; + mock_queue_index = 7; + mock_log_wakeups = 1; + + /* First packet: qdev->last_defer is 0. */ + EXPECT_EQ(0, q->qdev->last_defer); + mock_clock = 5000; + unit_log_clear(); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1500)); + EXPECT_EQ(5000, q->qdev->last_defer); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_STREQ("wake_up", unit_log_get()); + + /* Second packet: qdev->last_defer != 0. */ + mock_clock = 15000; + unit_log_clear(); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1500)); + EXPECT_EQ(15000, q->qdev->last_defer); + EXPECT_EQ(10000, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_STREQ("", unit_log_get()); +} + +TEST_F(homa_qdisc, homa_qdisc_defer_homa__basics) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + srpc4 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 6, 10000, 10000); + + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 5000, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 4000, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc3, &self->addr, 8000, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc4, &self->addr, 5000, 1500)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1239, offsets 8000]; " + "[id 1235, offsets 5000]; " + "[id 1241, offsets 5000]; " + "[id 1237, offsets 4000]", unit_log_get()); + EXPECT_EQ(5000, srpc1->qrpc.tx_left); + EXPECT_EQ(6000, srpc2->qrpc.tx_left); + EXPECT_EQ(2000, srpc3->qrpc.tx_left); + EXPECT_EQ(5000, srpc4->qrpc.tx_left); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_defer_homa__multiple_pkts_for_rpc) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + 
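+ /* Multiple packets deferred for one RPC should be queued within a
+ * single rbtree entry, preserving arrival order (see the expected
+ * log below: offsets 1000 6000 2500 4000).
+ */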
+ qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 1000, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 2000, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 6000, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 2500, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 4000, 1500)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]; " + "[id 1235, offsets 1000 6000 2500 4000]", + unit_log_get()); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_defer_homa__dont_update_tx_left) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc->qrpc.tx_left = 2000; + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 5000, 500)); + EXPECT_EQ(2000, srpc->qrpc.tx_left); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_defer_homa__nic_backlog_cycles_metric) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + + mock_clock = 5000; + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 1000, 1500)); + EXPECT_EQ(5000, qdev->last_defer); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + + mock_clock = 12000; + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 2000, 1500)); + EXPECT_EQ(12000, qdev->last_defer); + EXPECT_EQ(7000, homa_metrics_per_cpu()->nic_backlog_cycles); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_defer_homa__wake_up_pacer) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + + skb = new_test_skb(srpc, &self->addr, 5000, 1500); + unit_log_clear(); + mock_log_wakeups = 1; + homa_qdisc_defer_homa(qdev, skb); + EXPECT_STREQ("wake_up", unit_log_get()); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 5000]", unit_log_get()); + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_insert_rb__basics) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); 
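+ /* With 10000-byte messages, offsets 5000/7000/3000 leave tx_left of
+ * 5000, 3000 and 7000; the expected log lists the RPCs with the
+ * fewest remaining bytes first (SRPT order).
+ */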
+
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc1, &self->addr, 5000, 1500));
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc2, &self->addr, 7000, 1500));
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc3, &self->addr, 3000, 1500));
+ unit_log_clear();
+ log_deferred(qdev);
+ EXPECT_STREQ("[id 1237, offsets 7000]; "
+ "[id 1235, offsets 5000]; "
+ "[id 1239, offsets 3000]",
+ unit_log_get());
+ homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_left_chain)
+{
+ struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4;
+ struct homa_qdisc_dev *qdev;
+
+ qdev = homa_qdisc_qdev_get(self->dev);
+ srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id, 10000, 10000);
+ srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id + 2, 10000, 10000);
+ srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id + 4, 10000, 10000);
+ srpc4 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id + 6, 10000, 10000);
+
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc1, &self->addr, 5000, 1500));
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc2, &self->addr, 6000, 1500));
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc3, &self->addr, 7000, 1500));
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc4, &self->addr, 8000, 1500));
+ unit_log_clear();
+ log_deferred(qdev);
+ EXPECT_STREQ("[id 1241, offsets 8000]; "
+ "[id 1239, offsets 7000]; "
+ "[id 1237, offsets 6000]; "
+ "[id 1235, offsets 5000]",
+ unit_log_get());
+ homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_right_chain)
+{
+ struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4;
+ struct homa_qdisc_dev *qdev;
+
+ qdev = homa_qdisc_qdev_get(self->dev);
+ srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id, 10000, 10000);
+ srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id + 2, 10000, 10000);
+ srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id + 4, 10000, 10000);
+ srpc4 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id + 6, 10000, 10000);
+
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc1, &self->addr, 5000, 1500));
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc2, &self->addr, 4000, 1500));
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc3, &self->addr, 3000, 1500));
+ homa_qdisc_defer_homa(qdev,
+ new_test_skb(srpc4, &self->addr, 2000, 1500));
+ unit_log_clear();
+ log_deferred(qdev);
+ EXPECT_STREQ("[id 1235, offsets 5000]; "
+ "[id 1237, offsets 4000]; "
+ "[id 1239, offsets 3000]; "
+ "[id 1241, offsets 2000]",
+ unit_log_get());
+ homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__update_oldest_rpc)
+{
+ struct homa_rpc *srpc1, *srpc2, *srpc3;
+ struct homa_qdisc_dev *qdev;
+
+ qdev = homa_qdisc_qdev_get(self->dev);
+ srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id, 10000, 10000);
+ srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id + 2, 10000, 10000);
+ srpc3 = 
unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + + /* First insertion: oldest_rpc currently unknown, so can't update. */ + EXPECT_EQ(NULL, qdev->oldest_rpc); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 5000, 1500)); + EXPECT_EQ(NULL, qdev->oldest_rpc); + + /* Second insertion: new RPC is older. */ + srpc1->msgout.init_time = 10000; + srpc2->msgout.init_time = 5000; + qdev->oldest_rpc = srpc1; + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 7000, 1500)); + EXPECT_EQ(srpc2, qdev->oldest_rpc); + + /* Third insertion: new RPC is younger than oldest_rpc. */ + srpc3->msgout.init_time = 5001; + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc3, &self->addr, 3000, 1500)); + EXPECT_EQ(srpc2, qdev->oldest_rpc); + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__basics) +{ + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; + self->qdiscs[2]->dev_queue->dql.num_queued = + q->qdev->max_nic_queue_bytes + 1; + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); + atomic64_set(&q->qdev->link_idle_time, 20000); + + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q->qdev)); + EXPECT_EQ(1, self->qdiscs[2]->q.qlen); + EXPECT_EQ(0, unit_list_length(&q->qdev->deferred_qdiscs)); + EXPECT_LT(20000, atomic64_read(&q->qdev->link_idle_time)); + EXPECT_EQ(q, q->qdev->congested_qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__no_deferred_packets) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + unit_log_clear(); + EXPECT_EQ(0, homa_qdisc_xmit_deferred_tcp(qdev)); + EXPECT_EQ(0, self->qdiscs[2]->q.qlen); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__round_robin_between_qdiscs) +{ + struct homa_qdisc *q1, *q2, *q3; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + q1 = qdisc_priv(self->qdiscs[1]); + q1->ix = 1; + mock_queue_index = 1; + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q2 = qdisc_priv(self->qdiscs[2]); + q2->ix = 2; + mock_queue_index = 2; + homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 5000, 1100)); + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + q3 = qdisc_priv(self->qdiscs[3]); + q3->ix = 3; + mock_queue_index = 3; + homa_qdisc_defer_tcp(q3, mock_tcp_skb(&self->addr, 5000, 1200)); + EXPECT_EQ(3, unit_list_length(&q3->qdev->deferred_qdiscs)); + + q2->qdev->next_qdisc = &q3->defer_links; + EXPECT_EQ(1300, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_EQ(2, unit_list_length(&q3->qdev->deferred_qdiscs)); + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(1, self->qdiscs[1]->q.qlen); + EXPECT_EQ(1, unit_list_length(&q1->qdev->deferred_qdiscs)); + EXPECT_EQ(1200, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(1, self->qdiscs[2]->q.qlen); + EXPECT_EQ(0, unit_list_length(&q2->qdev->deferred_qdiscs)); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) +{ + struct homa_qdisc *q1; + + mock_clock = 10000; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + q1 = qdisc_priv(self->qdiscs[1]); + q1->ix = 1; + mock_queue_index = 1; + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); + homa_qdisc_defer_tcp(q1, 
mock_tcp_skb(&self->addr, 6000, 1100)); + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 6000, 1200)); + + mock_clock = 11000; + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q1->qdev)); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + mock_clock = 12000; + EXPECT_EQ(1200, homa_qdisc_xmit_deferred_tcp(q1->qdev)); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + mock_clock = 13000; + EXPECT_EQ(1300, homa_qdisc_xmit_deferred_tcp(q1->qdev)); + EXPECT_EQ(3000, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_EQ(0, q1->qdev->last_defer); +} + +TEST_F(homa_qdisc, homa_qdisc_get_oldest__return_cached_value) +{ + struct homa_rpc *srpc1; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 5000); + + qdev->oldest_rpc = srpc1; + EXPECT_EQ(srpc1, homa_qdisc_get_oldest(qdev)); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_oldest__iterate_rbtree) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 5000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 3000); + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + + srpc1->msgout.init_time = 10000; + srpc2->msgout.init_time = 5000; + srpc3->msgout.init_time = 7000; + + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 0, 1000)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 0, 1000)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc3, &self->addr, 0, 1000)); + + EXPECT_EQ(srpc2, homa_qdisc_get_oldest(qdev)); + EXPECT_EQ(srpc2, qdev->oldest_rpc); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_oldest__no_rpcs_in_rbtree) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + + EXPECT_EQ(NULL, homa_qdisc_get_oldest(qdev)); + + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__no_deferred_rpcs) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + + EXPECT_EQ(NULL, homa_qdisc_get_deferred_homa(qdev)); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__basics) +{ + struct homa_rpc *srpc1; + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + + skb = new_test_skb(srpc1, &self->addr, 5000, 500); + homa_qdisc_defer_homa(qdev, skb); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 5000]", unit_log_get()); + + qdev->srpt_bytes = 200; + EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("", unit_log_get()); + kfree_skb(skb); + EXPECT_EQ(-400, qdev->srpt_bytes); + EXPECT_EQ(4500, srpc1->qrpc.tx_left); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__choose_fifo_rpc) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + 
struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + ASSERT_NE(NULL, srpc2); + + skb = new_test_skb(srpc1, &self->addr, 0, 900); + homa_qdisc_defer_homa(qdev, skb); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, + 900)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]; [id 1235, offsets 0]", + unit_log_get()); + srpc1->msgout.init_time = 5000; + srpc2->msgout.init_time = 6000; + + self->homa.qshared->fifo_fraction = 200; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + qdev->srpt_bytes = -100; + + EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]", unit_log_get()); + kfree_skb(skb); + EXPECT_EQ(3900, qdev->srpt_bytes); + EXPECT_EQ(NULL, qdev->oldest_rpc); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__fifo_fraction_zero) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + ASSERT_NE(NULL, srpc2); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 0, 900)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, + 900)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]; [id 1235, offsets 0]", + unit_log_get()); + srpc1->msgout.init_time = 5000; + srpc2->msgout.init_time = 6000; + + self->homa.qshared->fifo_fraction = 0; + qdev->srpt_bytes = -100; + + skb = homa_qdisc_get_deferred_homa(qdev); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 0]", unit_log_get()); + kfree_skb(skb); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__multiple_packets_for_rpc) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + skb = new_test_skb(srpc, &self->addr, 2000, 500); + homa_qdisc_defer_homa(qdev, skb); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 4000, 500)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 2000 3000 4000]", unit_log_get()); + + EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 3000 4000]", unit_log_get()); + kfree_skb(skb); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__last_packet_for_rpc) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + 
&self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + ASSERT_NE(NULL, srpc2); + + skb = new_test_skb(srpc1, &self->addr, 5000, 500); + homa_qdisc_defer_homa(qdev, skb); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, + 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 3000, + 500)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 2000 3000]", + unit_log_get()); + qdev->oldest_rpc = srpc1; + + EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000 3000]", unit_log_get()); + EXPECT_EQ(NULL, qdev->oldest_rpc); + kfree_skb(skb); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__update_tx_left) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 4000, 500)); + srpc->qrpc.tx_left = 6000; + + /* First packet doesn't update tx_left. */ + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); + EXPECT_EQ(6000, srpc->qrpc.tx_left); + + /* Second packet does update tx_left. */ + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); + EXPECT_EQ(5500, srpc->qrpc.tx_left); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__reposition_rpc_in_rbtree) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + ASSERT_NE(NULL, srpc2); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 0, 1500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 1500, + 1500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 1000, + 1000)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 1000]; [id 1235, offsets 0 1500]", + unit_log_get()); + + qdev->oldest_rpc = srpc1; + qdev->srpt_bytes = -100; + + /* First extraction: FIFO RPC must be repositioned in rbtree. 
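+ * (transmitting a packet reduces the RPC's tx_left, which is its
+ * sort key).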
*/ + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 1500]; [id 1237, offsets 1000]", + unit_log_get()); + + /* Second extraction: FIFO RPC removed from tree.*/ + qdev->oldest_rpc = srpc2; + qdev->srpt_bytes = -100; + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 1500]", unit_log_get()); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__pacer_fifo_bytes_metric) +{ + struct homa_rpc *srpc1; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 0, 1500)); + + qdev->oldest_rpc = srpc1; + qdev->srpt_bytes = -100; + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); + EXPECT_EQ(1600, homa_metrics_per_cpu()->pacer_fifo_bytes); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__nic_backlog_cycles_metric) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + mock_clock = 5000; + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_EQ(5000, qdev->last_defer); + + mock_clock = 12000; + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_EQ(5000, qdev->last_defer); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + + mock_clock = 14000; + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); + EXPECT_EQ(9000, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, qdev->last_defer); + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__no_packets_available) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + EXPECT_EQ(0, homa_qdisc_xmit_deferred_homa(qdev)); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__packet_available) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + struct homa_qdisc *q; + u64 link_idle; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + q = qdisc_priv(self->qdiscs[3]); + mock_clock = 10000; + mock_queue_index = 3; + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + link_idle = atomic64_read(&qdev->link_idle_time); + self->qdiscs[3]->dev_queue->dql.num_queued = qdev->max_nic_queue_bytes + + 1; + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + + mock_clock = 11000; + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_homa(qdev)); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + EXPECT_EQ(q, qdev->congested_qdisc); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, 
homa_qdisc_xmit_deferred_homa__qdisc_not_homa) +{ + const struct Qdisc_ops *saved_ops; + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + u64 link_idle; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_clock = 10000; + mock_queue_index = 3; + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + link_idle = atomic64_read(&qdev->link_idle_time); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + + mock_clock = 11000; + saved_ops = self->qdiscs[3]->ops; + self->qdiscs[3]->ops = NULL; + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_homa(qdev)); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + self->qdiscs[3]->ops = saved_ops; + + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_free_homa) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 1000, 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 4000, 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 5000, 500)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 1000 2000 3000 4000 5000]", + unit_log_get()); + + homa_qdisc_free_homa(qdev); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("", unit_log_get()); + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__nic_idle) +{ + struct homa_qdisc_dev qdev; + + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. */ + mock_clock = 1000; + + EXPECT_EQ(1, homa_qdisc_update_link_idle(&qdev, 200, 0)); + EXPECT_EQ(1200 + HOMA_ETH_FRAME_OVERHEAD, + atomic64_read(&qdev.link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__queue_too_long) +{ + struct homa_qdisc_dev qdev; + + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. */ + mock_clock = 1000; + atomic64_set(&qdev.link_idle_time, 1100); + + /* First attempt: queue too long. */ + EXPECT_EQ(0, homa_qdisc_update_link_idle(&qdev, 200, 99)); + EXPECT_EQ(1100, atomic64_read(&qdev.link_idle_time)); + + /* Second attempt tolerates longer queue. */ + EXPECT_EQ(1, homa_qdisc_update_link_idle(&qdev, 200, 110)); + EXPECT_EQ(1300 + HOMA_ETH_FRAME_OVERHEAD, + atomic64_read(&qdev.link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__ignore_queue_length) +{ + struct homa_qdisc_dev qdev; + + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. 
*/ + mock_clock = 1000; + atomic64_set(&qdev.link_idle_time, 1200); + + EXPECT_EQ(1, homa_qdisc_update_link_idle(&qdev, 120, -1)); + EXPECT_EQ(1320 + HOMA_ETH_FRAME_OVERHEAD, + atomic64_read(&qdev.link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__cmpxchg_conflicts) +{ + struct homa_qdisc_dev qdev; + + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. */ + mock_clock = 1000; + mock_cmpxchg_errors = 0xf; + + EXPECT_EQ(1, homa_qdisc_update_link_idle(&qdev, 200, 0)); + EXPECT_EQ(1200 + HOMA_ETH_FRAME_OVERHEAD, + atomic64_read(&qdev.link_idle_time)); + EXPECT_EQ(4, homa_metrics_per_cpu()->idle_time_conflicts); +} + +TEST_F(homa_qdisc, homa_qdisc_pacer_main) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + /* This test checks for two things: + * (a) proper handling of deferred packets that arrive while sleeping + * (b) proper thread exit + */ + qdev = homa_qdisc_qdev_get(self->dev); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + mock_queue_index = 3; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + exit_hook_qdev = qdev; + exit_hook_count = 10; + unit_hook_register(exit_hook); + defer_hook_qdev = qdev; + defer_hook_skb = new_test_skb(srpc, &self->addr, 1000, 500); + unit_hook_register(defer_hook); + + homa_qdisc_pacer_main(qdev); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_homa_packets); + EXPECT_EQ(0, exit_hook_count); + + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) +{ + struct homa_qdisc_dev *qdev; + u64 link_idle; + struct homa_rpc *srpc; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + link_idle = atomic64_read(&qdev->link_idle_time); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + unit_log_clear(); + + mock_spin_lock_held = 1; + homa_qdisc_pacer(qdev, false); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_EQ(link_idle, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_qdisc_no_longer_congested) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + q = qdisc_priv(self->qdiscs[3]); + qdev = homa_qdisc_qdev_get(self->dev); + qdev->congested_qdisc = q; + self->qdiscs[3]->dev_queue->dql.num_queued = qdev->max_nic_queue_bytes + + 10; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + hook_qdisc = self->qdiscs[3]; + unit_hook_register(complete_qdisc_hook); + + homa_qdisc_pacer(qdev, false); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(NULL, qdev->congested_qdisc); + EXPECT_EQ(10, hook_qdisc->dev_queue->dql.num_completed); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, 
homa_qdisc_pacer__spin_until_link_idle) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + qdev = homa_qdisc_qdev_get(self->dev); + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + + mock_clock = 0; + mock_clock_tick = 1000; + atomic64_set(&qdev->link_idle_time, 10000); + self->homa.qshared->max_nic_est_backlog_cycles = 3500; + unit_log_clear(); + unit_hook_register(xmit_hook); + xmit_clock = 0; + + homa_qdisc_pacer(qdev, false); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_EQ(7000, xmit_clock); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + ASSERT_NE(NULL, srpc2); + + skb = new_test_skb(srpc1, &self->addr, 5000, 1500); + homa_qdisc_defer_homa(qdev, skb); + skb = new_test_skb(srpc2, &self->addr, 4000, 1500); + homa_qdisc_defer_homa(qdev, skb); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 4000]", + unit_log_get()); + + mock_clock = atomic64_read(&qdev->link_idle_time); + self->homa.qshared->max_nic_est_backlog_cycles = 100; + unit_log_clear(); + + homa_qdisc_pacer(qdev, false); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 4000]", unit_log_get()); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_LT(mock_clock + 100, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__dont_spin) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + qdev = homa_qdisc_qdev_get(self->dev); + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + + mock_clock = 0; + mock_clock_tick = 1000; + atomic64_set(&qdev->link_idle_time, 10000); + self->homa.qshared->max_nic_est_backlog_cycles = 3500; + unit_log_clear(); + + homa_qdisc_pacer(qdev, true); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__no_deferred_packets) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + qdev->homa_credit = -1000; + + homa_qdisc_pacer(qdev, false); + EXPECT_EQ(0, atomic64_read(&qdev->link_idle_time)); + EXPECT_EQ(-1000, qdev->homa_credit); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_homa_packet_no_tcp) +{ + 
struct homa_qdisc_dev *qdev;
+ struct homa_rpc *srpc;
+
+ EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
+ EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
+ mock_queue_index = 3;
+ qdev = homa_qdisc_qdev_get(self->dev);
+ srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id, 10000, 10000);
+ ASSERT_NE(NULL, srpc);
+
+ homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
+ EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
+ qdev->homa_credit = -100;
+ qdev->hnet->homa->qshared->homa_share = 40;
+
+ homa_qdisc_pacer(qdev, false);
+ EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
+ EXPECT_EQ(1, self->qdiscs[3]->q.qlen);
+ EXPECT_EQ(-65999, qdev->homa_credit);
+ EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_homa_packets);
+ EXPECT_EQ(1100, homa_metrics_per_cpu()->pacer_homa_bytes);
+ EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes);
+
+ homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_tcp_no_homa)
+{
+ struct homa_qdisc_dev *qdev;
+ struct homa_qdisc *q;
+
+ qdev = homa_qdisc_qdev_get(self->dev);
+ EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL));
+ q = qdisc_priv(self->qdiscs[2]);
+ q->ix = 2;
+ mock_queue_index = 2;
+
+ homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100));
+ homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1200));
+ EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
+ qdev->homa_credit = 1000;
+ qdev->hnet->homa->qshared->homa_share = 40;
+
+ homa_qdisc_pacer(qdev, false);
+ EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
+ EXPECT_EQ(2, self->qdiscs[2]->q.qlen);
+ EXPECT_EQ(52000, qdev->homa_credit);
+ EXPECT_EQ(2, homa_metrics_per_cpu()->pacer_tcp_packets);
+ EXPECT_EQ(2500, homa_metrics_per_cpu()->pacer_tcp_bytes);
+ EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes);
+
+ homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_pacer__both_protocols_have_packets_choose_tcp)
+{
+ struct homa_qdisc_dev *qdev;
+ struct homa_rpc *srpc;
+ struct homa_qdisc *q;
+
+ qdev = homa_qdisc_qdev_get(self->dev);
+ EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL));
+ q = qdisc_priv(self->qdiscs[2]);
+ q->ix = 2;
+ mock_queue_index = 2;
+ EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
+ srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id, 10000, 10000);
+ ASSERT_NE(NULL, srpc);
+
+ homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
+ homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100));
+ EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
+ EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
+ qdev->homa_credit = -100;
+ qdev->hnet->homa->qshared->homa_share = 40;
+
+ /* Arrange for the NIC queue to exceed its limit once the next
+ * packet is transmitted.
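+ * The pacer should then send just the one TCP packet and stop,
+ * leaving the Homa packet deferred.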
+ */ + atomic64_set(&qdev->link_idle_time, 1000000); + qdev->hnet->homa->qshared->max_nic_est_backlog_cycles = 10000; + mock_clock = 1000000 - 10000 + 100; + + homa_qdisc_pacer(qdev, false); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, self->qdiscs[2]->q.qlen); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_EQ(40*1200 - 100, qdev->homa_credit); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_tcp_packets); + EXPECT_EQ(1200, homa_metrics_per_cpu()->pacer_tcp_bytes); + EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_multiple_packets) +{ + struct homa_qdisc_dev *qdev; + struct homa_qdisc *q; + + qdev = homa_qdisc_qdev_get(self->dev); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; + + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100)); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1200)); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1300)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + qdev->hnet->homa->qshared->homa_share = 40; + qdev->hnet->homa->qshared->max_nic_est_backlog_cycles = 100000; + + homa_qdisc_pacer(qdev, false); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(3, self->qdiscs[2]->q.qlen); + EXPECT_EQ(3, homa_metrics_per_cpu()->pacer_tcp_packets); + EXPECT_EQ(3900, homa_metrics_per_cpu()->pacer_tcp_bytes); + EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_help_bytes_metric) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 800)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + unit_log_clear(); + + homa_qdisc_pacer(qdev, true); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_homa_packets); + EXPECT_EQ(900, homa_metrics_per_cpu()->pacer_homa_bytes); + EXPECT_EQ(900, homa_metrics_per_cpu()->pacer_help_bytes); + + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_pacer_check__enqueue_packet) +{ + struct homa_qdisc_dev *qdev, *qdev2; + struct homa_rpc *srpc; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + + /* Create 2 qdevs to verify that homa_qdisc_pacer_check loops over + * all qdevs. 
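+ * Only one of the two (qdev) has a deferred packet.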
+ */
+ qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa));
+ qdev = homa_qdisc_qdev_get(self->dev);
+ srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id, 10000, 10000);
+ ASSERT_NE(NULL, srpc);
+ homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
+ EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
+
+ atomic64_set(&qdev->link_idle_time, 20000);
+ mock_clock = 15000;
+ self->homa.qshared->max_nic_est_backlog_cycles = 12000;
+
+ homa_qdisc_pacer_check(&self->homa);
+ EXPECT_EQ(1, self->qdiscs[3]->q.qlen);
+ EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
+
+ homa_qdisc_qdev_put(qdev);
+ homa_qdisc_qdev_put(qdev2);
+}
+TEST_F(homa_qdisc, homa_qdisc_pacer_check__no_deferred_rpcs)
+{
+ struct homa_qdisc_dev *qdev, *qdev2;
+
+ /* Create 2 qdevs to verify that homa_qdisc_pacer_check loops over
+ * all qdevs.
+ */
+ qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa));
+ qdev = homa_qdisc_qdev_get(self->dev);
+ EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
+ EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
+
+ atomic64_set(&qdev->link_idle_time, 20000);
+ mock_clock = 15000;
+ self->homa.qshared->max_nic_est_backlog_cycles = 12000;
+
+ homa_qdisc_pacer_check(&self->homa);
+ EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
+ EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
+
+ homa_qdisc_qdev_put(qdev);
+ homa_qdisc_qdev_put(qdev2);
+}
+TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough)
+{
+ struct homa_qdisc_dev *qdev;
+ struct homa_rpc *srpc;
+
+ qdev = homa_qdisc_qdev_get(self->dev);
+ srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+ &self->server_ip, self->client_port,
+ self->server_id, 10000, 10000);
+ ASSERT_NE(NULL, srpc);
+ EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
+ EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
+ homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
+ EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
+
+ atomic64_set(&qdev->link_idle_time, 20000);
+ mock_clock = 13000;
+ self->homa.qshared->max_nic_est_backlog_cycles = 12000;
+
+ homa_qdisc_pacer_check(&self->homa);
+ EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
+ EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
+
+ homa_qdisc_qdev_put(qdev);
+}
+
+TEST_F(homa_qdisc, homa_qdev_update_sysctl__basics)
+{
+ struct homa_qdisc_dev *qdev;
+
+ qdev = homa_qdisc_qdev_get(self->dev);
+ EXPECT_FALSE(IS_ERR(qdev));
+
+ self->homa.link_mbps = 25000;
+ mock_link_mbps = 8000;
+ self->homa.qshared->max_link_usage = 90;
+ self->homa.qshared->max_nic_queue_usecs = 50;
+ qdev->srpt_bytes = -1000;
+ homa_qdev_update_sysctl(qdev);
+ EXPECT_EQ(8000, qdev->link_mbps);
+ EXPECT_EQ(1165084, qdev->cycles_per_mibyte);
+ EXPECT_EQ(50000, qdev->max_nic_queue_bytes);
+ EXPECT_EQ(0, qdev->srpt_bytes);
+
+ homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdev_update_sysctl__cant_get_link_speed_from_dev)
+{
+ struct homa_qdisc_dev *qdev;
+
+ qdev = homa_qdisc_qdev_get(self->dev);
+ EXPECT_FALSE(IS_ERR(qdev));
+
+ self->homa.link_mbps = 16000;
+ mock_link_mbps = 8000;
+ mock_ethtool_ksettings_errors = 1;
+ homa_qdev_update_sysctl(qdev);
+ EXPECT_EQ(16000, qdev->link_mbps);
+ EXPECT_EQ(529583, qdev->cycles_per_mibyte);
+
+ homa_qdisc_qdev_put(qdev);
+}
+
+TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__fifo_fraction)
+{
+ self->homa.qshared->fifo_fraction = 500;
+ homa_qdisc_update_sysctl_deps(self->homa.qshared);
+ EXPECT_EQ(1 << HOMA_FIFO_WEIGHT_SHIFT, self->homa.qshared->fifo_weight);
+
+ self->homa.qshared->fifo_fraction = 200;
+ 
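+ /* 200/1000 of the output should go to the FIFO RPC, i.e. 4 SRPT
+ * bytes per FIFO byte, hence 4 << HOMA_FIFO_WEIGHT_SHIFT below
+ * (assuming fifo_fraction is expressed in thousandths, as the 500
+ * and 800 cases suggest).
+ */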
homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(4 * (1 << HOMA_FIFO_WEIGHT_SHIFT), + self->homa.qshared->fifo_weight); + + self->homa.qshared->fifo_fraction = 800; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ((1 << HOMA_FIFO_WEIGHT_SHIFT) / 4, self->homa.qshared->fifo_weight); + + self->homa.qshared->fifo_fraction = 0; + self->homa.qshared->fifo_weight = -1; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(-1, self->homa.qshared->fifo_weight); +} +TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__max_nic_est_backlog_cycles) +{ + self->homa.qshared->max_nic_est_backlog_usecs = 6; + self->homa.link_mbps = 10000; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(6000, self->homa.qshared->max_nic_est_backlog_cycles); +} +TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__limit_homa_share) +{ + self->homa.qshared->homa_share = -1; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(0, self->homa.qshared->homa_share); + + self->homa.qshared->homa_share = 0; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(0, self->homa.qshared->homa_share); + + self->homa.qshared->homa_share = 100; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(100, self->homa.qshared->homa_share); + + self->homa.qshared->homa_share = 101; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(100, self->homa.qshared->homa_share); +} +TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__limit_max_link_usage) +{ + self->homa.qshared->max_link_usage = 4; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(5, self->homa.qshared->max_link_usage); + + self->homa.qshared->max_link_usage = 6; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(6, self->homa.qshared->max_link_usage); + + self->homa.qshared->max_link_usage = 100; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(100, self->homa.qshared->max_link_usage); + + self->homa.qshared->max_link_usage = 101; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(100, self->homa.qshared->max_link_usage); +} +TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__update_all_qdevs) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct netdev_queue txq2; + struct net_device net_device2; + struct homa_qdisc *q, *q2; + struct Qdisc *qdisc2; + + /* qdisc has a net device that provides link speed; qdisc2, created + * below, has a net device that doesn't provide link speed, so it + * uses homa->link_mbps. 
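+ * (net_device2 is zero-filled below, so its link-speed query fails.)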
+ */ + memset(&txq2, 0, sizeof(txq2)); + memset(&net_device2, 0, sizeof(net_device2)); + txq2.dev = &net_device2; + net_device2.nd_net.net = &mock_nets[0]; + qdisc2 = mock_alloc_qdisc(&txq2); + + self->homa.link_mbps = 16000; + mock_link_mbps = 40000; + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL)); + q = qdisc_priv(qdisc); + q2 = qdisc_priv(qdisc2); + EXPECT_EQ(40000, q->qdev->link_mbps); + EXPECT_EQ(16000, q2->qdev->link_mbps); + + self->homa.link_mbps = 25000; + mock_link_mbps = 8000; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + + EXPECT_EQ(8000, q->qdev->link_mbps); + EXPECT_EQ(25000, q2->qdev->link_mbps); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); + homa_qdisc_destroy(qdisc2); + kfree(qdisc2); +} + +/* Inline functions in homa_qdisc.h: */ + +TEST_F(homa_qdisc, homa_qdisc_precedes__bytes_left) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + + srpc1->qrpc.tx_left = 5000; + srpc2->qrpc.tx_left = 3000; + srpc3->qrpc.tx_left = 7000; + EXPECT_EQ(0, homa_qdisc_precedes(srpc1, srpc2)); + EXPECT_EQ(1, homa_qdisc_precedes(srpc1, srpc3)); +} +TEST_F(homa_qdisc, homa_qdisc_precedes__init_time) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc1->msgout.init_time = 1000; + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + srpc2->msgout.init_time = 500; + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + srpc3->msgout.init_time = 2000; + + EXPECT_EQ(0, homa_qdisc_precedes(srpc1, srpc2)); + EXPECT_EQ(1, homa_qdisc_precedes(srpc1, srpc3)); +} +TEST_F(homa_qdisc, homa_qdisc_precedes__rpc_struct_address) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + int result; + + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + + if (srpc1 > srpc2) + result = homa_qdisc_precedes(srpc1, srpc2); + else + result = homa_qdisc_precedes(srpc2, srpc1); + EXPECT_EQ(0, result); + if (srpc1 < srpc3) + result = homa_qdisc_precedes(srpc1, srpc3); + else + result = homa_qdisc_precedes(srpc3, srpc1); + EXPECT_EQ(1, result); +} diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c new file mode 100644 index 00000000..9cd00841 --- /dev/null +++ b/test/unit_homa_rpc.c @@ -0,0 +1,1092 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +#include "homa_impl.h" +#include "homa_grant.h" +#include "homa_peer.h" +#include "homa_pool.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include 
"ccutils.h" +#include "mock.h" +#include "utils.h" + +#ifndef __STRIP__ /* See strip.py */ +#include "homa_pacer.h" +#endif /* See strip.py */ + +#define n(x) htons(x) +#define N(x) htonl(x) + +FIXTURE(homa_rpc) { + struct in6_addr client_ip[1]; + int client_port; + struct in6_addr server_ip[1]; + int server_port; + u64 client_id; + u64 server_id; + struct homa homa; + struct homa_net *hnet; + struct homa_sock hsk; + union sockaddr_in_union server_addr; + struct homa_data_hdr data; + struct homa_rpc *crpc; + struct iovec iovec; + struct iov_iter iter; +}; +FIXTURE_SETUP(homa_rpc) +{ + self->client_ip[0] = unit_get_in_addr("196.168.0.1"); + self->client_port = 40000; + self->server_ip[0] = unit_get_in_addr("1.2.3.4"); + self->server_port = 99; + self->client_id = 1234; + self->server_id = 1235; + self->server_addr.in6.sin6_family = AF_INET; + self->server_addr.in6.sin6_addr = *self->server_ip; + self->server_addr.in6.sin6_port = htons(self->server_port); + homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); +#ifndef __STRIP__ /* See strip.py */ + self->homa.unsched_bytes = 10000; + self->homa.grant->window = 10000; +#endif /* See strip.py */ + mock_sock_init(&self->hsk, self->hnet, 0); + memset(&self->data, 0, sizeof(self->data)); + self->data.common = (struct homa_common_hdr){ + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = DATA, + .sender_id = cpu_to_be64(self->client_id) + }; + self->data.message_length = htonl(10000); +#ifndef __STRIP__ /* See strip.py */ + self->data.incoming = htonl(10000); +#endif /* See strip.py */ + self->iovec.iov_base = (void *) 2000; + self->iovec.iov_len = 10000; + iov_iter_init(&self->iter, WRITE, &self->iovec, 1, self->iovec.iov_len); + unit_log_clear(); +} +FIXTURE_TEARDOWN(homa_rpc) +{ + homa_destroy(&self->homa); + unit_teardown(); +} + +/** + * dead_rpcs() - Logs the ids for all of the RPCS in hsk->dead_rpcs. + * @hsk: Homa socket to check for dead RPCs. + * + * Return: the contents of the unit test log. 
+ */ +static const char *dead_rpcs(struct homa_sock *hsk) +{ + struct homa_rpc *rpc; + + list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) + unit_log_printf(" ", "%llu", rpc->id); + return unit_log_get(); +} + +TEST_F(homa_rpc, homa_rpc_alloc_client__normal) +{ + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, + &self->server_addr); + + ASSERT_FALSE(IS_ERR(crpc)); + homa_rpc_end(crpc); + homa_rpc_unlock(crpc); +} +TEST_F(homa_rpc, homa_rpc_alloc_client__malloc_error) +{ + struct homa_rpc *crpc; + + mock_kmalloc_errors = 1; + crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + EXPECT_TRUE(IS_ERR(crpc)); + EXPECT_STREQ("couldn't allocate memory for client RPC", self->hsk.error_msg); + EXPECT_EQ(ENOMEM, -PTR_ERR(crpc)); +} +TEST_F(homa_rpc, homa_rpc_alloc_client__route_error) +{ + struct homa_rpc *crpc; + + mock_route_errors = 1; + crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + EXPECT_TRUE(IS_ERR(crpc)); + EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(crpc)); + EXPECT_STREQ("couldn't find route for peer", self->hsk.error_msg); +} +TEST_F(homa_rpc, homa_rpc_alloc_client__socket_shutdown) +{ + struct homa_rpc *crpc; + + self->hsk.shutdown = 1; + crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + EXPECT_TRUE(IS_ERR(crpc)); + EXPECT_EQ(ESHUTDOWN, -PTR_ERR(crpc)); + EXPECT_STREQ("socket has been shut down", self->hsk.error_msg); + self->hsk.shutdown = 0; +} + +TEST_F(homa_rpc, homa_rpc_alloc_server__normal) +{ + struct homa_rpc *srpc; + int created; + + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc)); + homa_rpc_unlock(srpc); + self->data.message_length = N(1600); + homa_data_pkt(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0), srpc); + EXPECT_EQ(RPC_INCOMING, srpc->state); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(1, created); + homa_rpc_end(srpc); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__no_buffer_pool) +{ + struct homa_rpc *srpc; + int created; + + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = NULL; + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + EXPECT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__already_exists) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + int created; + + srpc1 = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc1)); + homa_rpc_unlock(srpc1); + self->data.common.sender_id = cpu_to_be64( + be64_to_cpu(self->data.common.sender_id) + + 2*HOMA_SERVER_RPC_BUCKETS); + srpc2 = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc2)); + EXPECT_EQ(1, created); + homa_rpc_unlock(srpc2); + EXPECT_NE(srpc2, srpc1); + self->data.common.sender_id = cpu_to_be64( + be64_to_cpu(self->data.common.sender_id) + - 2*HOMA_SERVER_RPC_BUCKETS); + srpc3 = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc3)); + EXPECT_EQ(0, created); + homa_rpc_unlock(srpc3); + EXPECT_EQ(srpc3, srpc1); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__malloc_error) +{ + struct homa_rpc *srpc; + int created; + + mock_kmalloc_errors = 1; + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + EXPECT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__addr_error) +{ 
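+ /* mock_route_errors forces the route lookup for the peer to fail,
+ * which should surface as EHOSTUNREACH.
+ */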
+ struct homa_rpc *srpc; + int created; + + mock_route_errors = 1; + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + EXPECT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(srpc)); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__socket_shutdown) +{ + struct homa_rpc *srpc; + int created; + + self->hsk.shutdown = 1; + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + EXPECT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(ESHUTDOWN, -PTR_ERR(srpc)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + self->hsk.shutdown = 0; +} +TEST_F(homa_rpc, homa_rpc_alloc_server__allocate_buffers) +{ + struct homa_rpc *srpc; + int created; + + self->data.message_length = N(3*HOMA_BPAGE_SIZE); + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc)); + homa_rpc_unlock(srpc); + EXPECT_EQ(3, srpc->msgin.num_bpages); + homa_rpc_end(srpc); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__cant_allocate_buffers) +{ + struct homa_rpc *srpc; + int created; + + self->data.message_length = N(1400); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__handoff_rpc) +{ + struct homa_rpc *srpc; + int created; + + self->data.message_length = N(1400); + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc)); + homa_rpc_unlock(srpc); + EXPECT_EQ(RPC_INCOMING, srpc->state); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); + homa_rpc_end(srpc); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__dont_handoff_no_buffers) +{ + struct homa_rpc *srpc; + int created; + + self->data.message_length = N(1400); + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc)); + homa_rpc_unlock(srpc); + EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); + homa_rpc_end(srpc); +} +TEST_F(homa_rpc, homa_rpc_alloc_server__dont_handoff_rpc) +{ + struct homa_rpc *srpc; + int created; + + self->data.message_length = N(2800); + self->data.seg.offset = N(1400); + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc)); + homa_rpc_unlock(srpc); + EXPECT_EQ(RPC_INCOMING, srpc->state); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); + homa_rpc_end(srpc); +} + +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_rpc, homa_bucket_lock_slow) +{ + struct homa_rpc *crpc, *srpc; + int created; + + mock_clock_tick = 10; + crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(IS_ERR(crpc)); + homa_rpc_end(crpc); + homa_rpc_unlock(crpc); + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_FALSE(IS_ERR(srpc)); + homa_rpc_unlock(srpc); + + EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_misses); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_miss_cycles); + homa_bucket_lock_slow(crpc->bucket, crpc->id); + homa_rpc_unlock(crpc); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_lock_misses); + EXPECT_NE(0, homa_metrics_per_cpu()->client_lock_miss_cycles); + EXPECT_EQ(0, 
homa_metrics_per_cpu()->server_lock_misses); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_miss_cycles); + homa_bucket_lock_slow(srpc->bucket, srpc->id); + homa_rpc_unlock(srpc); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_lock_misses); + EXPECT_EQ(10, homa_metrics_per_cpu()->server_lock_miss_cycles); +} +#endif /* See strip.py */ + +TEST_F(homa_rpc, homa_rpc_acked__basics) +{ + struct homa_rpc *srpc; + struct homa_sock hsk; + struct homa_ack ack = {}; + + mock_sock_init(&hsk, self->hnet, self->server_port); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); + ASSERT_NE(NULL, srpc); + ack.server_port = htons(self->server_port); + ack.client_id = cpu_to_be64(self->client_id); + homa_rpc_acked(&hsk, self->client_ip, &ack); + EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); + EXPECT_EQ(1, unit_list_length(&hsk.dead_rpcs)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); + unit_sock_destroy(&hsk); +} +TEST_F(homa_rpc, homa_rpc_acked__lookup_socket) +{ + struct homa_ack ack = {}; + struct homa_rpc *srpc; + struct homa_sock hsk; + + mock_sock_init(&hsk, self->hnet, self->server_port); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); + ASSERT_NE(NULL, srpc); + ack.server_port = htons(self->server_port); + ack.client_id = cpu_to_be64(self->client_id); + homa_rpc_acked(&self->hsk, self->client_ip, &ack); + EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); + unit_sock_destroy(&hsk); +} +TEST_F(homa_rpc, homa_rpc_acked__no_such_socket) +{ + struct homa_ack ack = {}; + struct homa_rpc *srpc; + struct homa_sock hsk; + + mock_sock_init(&hsk, self->hnet, self->server_port); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); + ASSERT_NE(NULL, srpc); + ack.server_port = htons(self->server_port+1); + ack.client_id = cpu_to_be64(self->client_id); + homa_rpc_acked(&hsk, self->client_ip, &ack); + EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); + EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); + unit_sock_destroy(&hsk); +} +TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) +{ + struct homa_ack ack = {}; + struct homa_rpc *srpc; + struct homa_sock hsk; + + mock_sock_init(&hsk, self->hnet, self->server_port); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); + ASSERT_NE(NULL, srpc); + ack.server_port = htons(self->server_port); + ack.client_id = cpu_to_be64(self->client_id+10); + homa_rpc_acked(&hsk, self->client_ip, &ack); + EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); + EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); + unit_sock_destroy(&hsk); +} + +TEST_F(homa_rpc, homa_rpc_end__basics) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 20000); + +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, self->homa.grant->num_grantable_rpcs); +#endif /* See strip.py */ + ASSERT_NE(NULL, crpc); + unit_log_clear(); + mock_log_rcu_sched = 1; + homa_rpc_end(crpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); +#endif /* See strip.py */ + EXPECT_EQ(NULL, homa_rpc_find_client(&self->hsk, crpc->id)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(1, 
unit_list_length(&self->hsk.dead_rpcs)); +} +TEST_F(homa_rpc, homa_rpc_end__already_dead) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 100); + + ASSERT_NE(NULL, crpc); + unit_log_clear(); + homa_rpc_end(crpc); + EXPECT_STREQ("homa_rpc_end invoked", + unit_log_get()); + unit_log_clear(); + homa_rpc_end(crpc); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_end__remove_from_ready_rpcs) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 100); + + ASSERT_NE(NULL, crpc); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); + homa_rpc_end(crpc); + EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); +} +TEST_F(homa_rpc, homa_rpc_end__state_ready) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 100); + + ASSERT_NE(NULL, crpc); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); + homa_rpc_end(crpc); + EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); +} +TEST_F(homa_rpc, homa_rpc_end__free_gaps) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + +#ifndef __STRIP__ /* See strip.py */ + homa_message_in_init(crpc, 10000, 0); +#else /* See strip.py */ + homa_message_in_init(crpc, 10000); +#endif /* See strip.py */ + unit_log_clear(); + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 1400)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 0, end 1400; start 2800, end 4200", + unit_print_gaps(crpc)); + + homa_rpc_end(crpc); + /* (Test infrastructure will complain if gaps aren't freed) */ +} +TEST_F(homa_rpc, homa_rpc_end__dead_buffs) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 10000, 1000); + + ASSERT_NE(NULL, crpc1); + homa_rpc_end(crpc1); + EXPECT_EQ(9, self->homa.max_dead_buffs); + EXPECT_EQ(9, self->hsk.dead_skbs); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 5000, 1000); + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc2); + EXPECT_EQ(14, self->homa.max_dead_buffs); + EXPECT_EQ(14, self->hsk.dead_skbs); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_rpc, homa_rpc_end__remove_from_throttled_list) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 10000, 1000); + + homa_pacer_manage_rpc(crpc); + EXPECT_EQ(1, unit_list_length(&self->homa.pacer->throttled_rpcs)); + unit_log_clear(); + homa_rpc_end(crpc); + EXPECT_EQ(0, unit_list_length(&self->homa.pacer->throttled_rpcs)); +} +#endif /* See strip.py */ + +TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) +{ + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); +} +TEST_F(homa_rpc, homa_rpc_reap__basics) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, 
self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 5000, 100); + struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+4, 2000, 100); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + ASSERT_NE(NULL, crpc3); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + homa_rpc_end(crpc3); + unit_log_clear(); + EXPECT_STREQ("1234 1236 1238", dead_rpcs(&self->hsk)); + EXPECT_EQ(11, self->hsk.dead_skbs); + unit_log_clear(); + self->homa.reap_limit = 7; + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1234", unit_log_get()); + unit_log_clear(); + EXPECT_STREQ("1236 1238", dead_rpcs(&self->hsk)); + EXPECT_EQ(3, self->hsk.dead_skbs); +} +TEST_F(homa_rpc, homa_rpc_reap__reap_all) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 20000, 100); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + unit_log_clear(); + EXPECT_STREQ("1234 1236", dead_rpcs(&self->hsk)); + self->homa.reap_limit = 3; + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, true)); + EXPECT_STREQ("reaped 1234; reaped 1236", unit_log_get()); + unit_log_clear(); + EXPECT_STREQ("", dead_rpcs(&self->hsk)); + EXPECT_EQ(0, self->hsk.dead_skbs); +} +TEST_F(homa_rpc, homa_rpc_reap__protected) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + + ASSERT_NE(NULL, crpc1); + homa_rpc_end(crpc1); + unit_log_clear(); + homa_protect_rpcs(&self->hsk); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + homa_unprotect_rpcs(&self->hsk); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_locked) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 2000); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + unit_log_clear(); + self->homa.reap_limit = 3; +#ifndef __STRIP__ /* See strip.py */ + mock_trylock_errors = 2; +#else /* See strip.py */ + mock_trylock_errors = 1; +#endif /* See strip.py */ + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1236", unit_log_get()); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps)); + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps)); + EXPECT_STREQ("reaped 1234", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 2000); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + unit_log_clear(); + 
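/* Hold a reference on crpc1 so the reaper must skip it until the
+ * reference is dropped. */
+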
homa_rpc_hold(crpc1); + self->homa.reap_limit = 3; + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1236", unit_log_get()); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps)); + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + IF_NO_STRIP(EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps)); + EXPECT_STREQ("", unit_log_get()); + homa_rpc_put(crpc1); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1234", unit_log_get()); + IF_NO_STRIP(EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps)); +} +TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_skb_refcount) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 2000); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + skb_get(crpc1->msgout.packets); + EXPECT_EQ(5, self->hsk.dead_skbs); + unit_log_clear(); + + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1236", unit_log_get()); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_active_skbs)); + EXPECT_EQ(4, self->hsk.dead_skbs); + + kfree_skb(crpc1->msgout.to_free); + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1234", unit_log_get()); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_active_skbs)); + EXPECT_EQ(0, self->hsk.dead_skbs); +} +TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 10000, 100); + + ASSERT_NE(NULL, crpc); + homa_rpc_end(crpc); + EXPECT_EQ(9, self->hsk.dead_skbs); + unit_log_clear(); + self->homa.reap_limit = 5; + homa_rpc_reap(&self->hsk, false); + EXPECT_STREQ("1234", dead_rpcs(&self->hsk)); + EXPECT_EQ(4, self->hsk.dead_skbs); +} +TEST_F(homa_rpc, homa_rpc_reap__skb_memory_accounting) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 5000, 100); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + crpc1->msgout.skb_memory = 2000; + crpc2->msgout.skb_memory = 3000; + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + unit_log_clear(); + EXPECT_STREQ("1234 1236", dead_rpcs(&self->hsk)); + refcount_set(&self->hsk.sock.sk_wmem_alloc, 5001); + EXPECT_EQ(9, self->hsk.dead_skbs); + unit_log_clear(); + self->homa.reap_limit = 7; + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1234", unit_log_get()); + unit_log_clear(); + EXPECT_STREQ("1236", dead_rpcs(&self->hsk)); + EXPECT_EQ(1, self->hsk.dead_skbs); + EXPECT_EQ(3001, refcount_read(&self->hsk.sock.sk_wmem_alloc)); +} +TEST_F(homa_rpc, homa_rpc_reap__release_buffers) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + 4000, 98, 1000, 150000); + struct homa_pool *pool = self->hsk.buffer_pool; + + ASSERT_NE(NULL, crpc); + EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); + homa_rpc_end(crpc); + EXPECT_EQ(1, 
atomic_read(&pool->descriptors[1].refs)); + self->hsk.buffer_pool->check_waiting_invoked = 0; + self->homa.reap_limit = 5; + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(0, atomic_read(&pool->descriptors[1].refs)); + EXPECT_EQ(1, self->hsk.buffer_pool->check_waiting_invoked); +} +TEST_F(homa_rpc, homa_rpc_reap__free_gaps) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + 4000, 98, 1000, 150000); + + ASSERT_NE(NULL, crpc); + homa_gap_alloc(&crpc->msgin.gaps, 1000, 2000); + mock_clock = 1000; + homa_gap_alloc(&crpc->msgin.gaps, 5000, 6000); + + EXPECT_STREQ("start 1000, end 2000; start 5000, end 6000, time 1000", + unit_print_gaps(crpc)); + homa_rpc_end(crpc); + self->homa.reap_limit = 5; + homa_rpc_reap(&self->hsk, false); + // Test framework will complain if memory not freed. +} +TEST_F(homa_rpc, homa_rpc_reap__release_peer_ref) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + 4000, 98, 1000, 150000); + struct homa_peer *peer; + + ASSERT_NE(NULL, crpc); + peer = crpc->peer; + EXPECT_EQ(2, refcount_read(&peer->refs)); + + homa_rpc_end(crpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(1, refcount_read(&peer->refs)); + EXPECT_EQ(NULL, crpc->peer); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_rpc, homa_rpc_reap__metrics_for_client_response) +{ + struct homa_rpc *crpc, *crpc2; + + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, 4000, 98, 4000, 10000); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_response_bytes_done); + + homa_rpc_end(crpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(10000, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_done); + + /* Second RPC has already completed, so no need to increment metrics. 
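+ * (Its 1400-byte response arrived in a single packet, so the counts
+ * were updated on arrival rather than at reap time.)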
*/ + homa_metrics_per_cpu()->client_response_bytes_done = 0; + homa_metrics_per_cpu()->client_responses_done = 0; + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, 4000, 98, 4000, 1400); + ASSERT_NE(NULL, crpc2); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_done); + + homa_rpc_end(crpc2); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_done); +} +TEST_F(homa_rpc, homa_rpc_reap__metrics_for_client_request) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, 4000, 98, 4000, 10000); + ASSERT_NE(NULL, crpc); + crpc->msgout.granted = 1000; + homa_rpc_lock(crpc); + homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_request_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_requests_done); + + homa_rpc_end(crpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(4000, homa_metrics_per_cpu()->client_request_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_requests_done); +} +TEST_F(homa_rpc, homa_rpc_reap__metrics_for_server_request) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 5000, 10000); + ASSERT_NE(NULL, srpc); + EXPECT_EQ(1400, homa_metrics_per_cpu()->server_request_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_requests_done); + + homa_rpc_end(srpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(5000, homa_metrics_per_cpu()->server_request_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_requests_done); +} +TEST_F(homa_rpc, homa_rpc_reap__metrics_for_server_response) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 5000, 10000); + ASSERT_NE(NULL, srpc); + srpc->msgout.granted = 1000; + homa_rpc_lock(srpc); + homa_xmit_data(srpc, false); + homa_rpc_unlock(srpc); + EXPECT_EQ(1400, homa_metrics_per_cpu()->server_response_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_responses_done); + + homa_rpc_end(srpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(10000, homa_metrics_per_cpu()->server_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_responses_done); +} +#endif /* See strip.py */ +TEST_F(homa_rpc, homa_rpc_reap__call_homa_sock_wakeup_wmem) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + 4000, 98, 1000, 150000); + + ASSERT_NE(NULL, crpc); + homa_rpc_end(crpc); + set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(0, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} + +TEST_F(homa_rpc, homa_rpc_find_client) +{ + struct homa_rpc *crpc1, *crpc2, *crpc3, *crpc4; + + atomic64_set(&self->homa.next_outgoing_id, 3); + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 10000, 1000); + atomic64_set(&self->homa.next_outgoing_id, 3 + 3*HOMA_CLIENT_RPC_BUCKETS); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id+2, + 10000, 1000); + atomic64_set(&self->homa.next_outgoing_id, + 3 + 
10*HOMA_CLIENT_RPC_BUCKETS); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id+4, + 10000, 1000); + atomic64_set(&self->homa.next_outgoing_id, 40); + crpc4 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id+6, + 10000, 1000); + + EXPECT_EQ(crpc1, homa_rpc_find_client(&self->hsk, crpc1->id)); + homa_rpc_unlock(crpc1); + EXPECT_EQ(crpc2, homa_rpc_find_client(&self->hsk, crpc2->id)); + homa_rpc_unlock(crpc2); + EXPECT_EQ(crpc3, homa_rpc_find_client(&self->hsk, crpc3->id)); + homa_rpc_unlock(crpc3); + EXPECT_EQ(crpc4, homa_rpc_find_client(&self->hsk, crpc4->id)); + homa_rpc_unlock(crpc4); + EXPECT_EQ(NULL, homa_rpc_find_client(&self->hsk, 15)); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + homa_rpc_end(crpc3); + homa_rpc_end(crpc4); +} + +TEST_F(homa_rpc, homa_rpc_find_server) +{ + struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id + 30*HOMA_SERVER_RPC_BUCKETS, + 10000, 100); + struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port+1, + self->server_id + 10*HOMA_SERVER_RPC_BUCKETS, + 10000, 100); + struct homa_rpc *srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port+1, + self->server_id + 4, 10000, 100); + + ASSERT_NE(NULL, srpc1); + ASSERT_NE(NULL, srpc2); + ASSERT_NE(NULL, srpc3); + ASSERT_NE(NULL, srpc4); + EXPECT_EQ(srpc1, homa_rpc_find_server(&self->hsk, self->client_ip, + srpc1->id)); + homa_rpc_unlock(srpc1); + EXPECT_EQ(srpc2, homa_rpc_find_server(&self->hsk, self->client_ip, + srpc2->id)); + homa_rpc_unlock(srpc2); + EXPECT_EQ(srpc3, homa_rpc_find_server(&self->hsk, self->client_ip, + srpc3->id)); + homa_rpc_unlock(srpc3); + EXPECT_EQ(srpc4, homa_rpc_find_server(&self->hsk, self->client_ip, + srpc4->id)); + homa_rpc_unlock(srpc4); + EXPECT_EQ(NULL, homa_rpc_find_server(&self->hsk, self->client_ip, 3)); +} + +TEST_F(homa_rpc, homa_rpc_get_info__basics) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 20000); + struct homa_rpc_info info; + + crpc->completion_cookie = 1111; + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(AF_INET6, info.peer.in6.sin6_family); + EXPECT_EQ(0, info.peer.in6.sin6_addr.in6_u.u6_addr32[0]); + EXPECT_EQ(0, info.peer.in6.sin6_addr.in6_u.u6_addr32[1]); + EXPECT_EQ(htonl(0x0000ffff), info.peer.in6.sin6_addr.in6_u.u6_addr32[2]); + EXPECT_EQ(0x04030201, info.peer.in6.sin6_addr.in6_u.u6_addr32[3]); + EXPECT_EQ(99, ntohs(info.peer.in6.sin6_port)); + EXPECT_EQ(1234, info.id); + EXPECT_EQ(1111, info.completion_cookie); + EXPECT_EQ(1000, info.tx_length); + EXPECT_EQ(1000, info.tx_sent); + EXPECT_EQ(1000, info.tx_granted); + IF_NO_STRIP(EXPECT_EQ(0, info.tx_prio)); + EXPECT_EQ(20000, info.rx_length); + EXPECT_EQ(18600, info.rx_remaining); + EXPECT_EQ(0, info.rx_gaps); + EXPECT_EQ(0, info.rx_gap_bytes); + IF_NO_STRIP(EXPECT_EQ(11400, info.rx_granted)); +} +TEST_F(homa_rpc, homa_rpc_get_info__ipv4_address) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 
1000, 20000); + struct homa_rpc_info info; + + self->hsk.inet.sk.sk_family = AF_INET; + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(AF_INET, info.peer.in4.sin_family); + EXPECT_EQ(0x04030201, info.peer.in4.sin_addr.s_addr); + EXPECT_EQ(99, ntohs(info.peer.in4.sin_port)); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_rpc, homa_rpc_get_info__tx_incomplete) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 20000); + struct homa_rpc_info info; + + crpc->msgout.granted = 4000; + crpc->msgout.next_xmit_offset = 1400; + crpc->msgout.sched_priority = 5; + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(5000, info.tx_length); + EXPECT_EQ(1400, info.tx_sent); + EXPECT_EQ(4000, info.tx_granted); + EXPECT_EQ(5, info.tx_prio); +} +#endif /* See strip.py */ +TEST_F(homa_rpc, homa_rpc_get_info__tx_not_started) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + struct homa_rpc_info info; + + homa_rpc_get_info(srpc, &info); + EXPECT_EQ(-1, info.tx_length); +} +TEST_F(homa_rpc, homa_rpc_get_info__rx_gaps) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 20000); + struct homa_rpc_info info; + + homa_gap_alloc(&crpc->msgin.gaps, 1000, 2000); + homa_gap_alloc(&crpc->msgin.gaps, 4000, 6000); + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(2, info.rx_gaps); + EXPECT_EQ(3000, info.rx_gap_bytes); +} +TEST_F(homa_rpc, homa_rpc_get_info__rx_not_started) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 20000); + struct homa_rpc_info info; + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(-1, info.rx_length); +} +TEST_F(homa_rpc, homa_rpc_get_info__HOMA_RPC_BUF_STALL) +{ + struct homa_rpc_info info; + struct homa_rpc *crpc; + + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 20000); + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(1, info.flags); +} +TEST_F(homa_rpc, homa_rpc_get_info__HOMA_RPC_RX_READY_and_HOMA_RPC_RX_COPY) +{ + struct homa_rpc_info info; + struct homa_rpc *srpc; + struct homa_sock hsk; + + mock_sock_init(&hsk, self->hnet, self->server_port); + self->data.message_length = htonl(2400); + srpc = unit_server_rpc(&hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 2400, 100); + + /* First call: some bytes haven't been received. */ + homa_rpc_get_info(srpc, &info); + EXPECT_EQ(2400, info.rx_length); + EXPECT_EQ(1000, info.rx_remaining); + EXPECT_EQ(HOMA_RPC_RX_COPY, info.flags); + + /* Second call: all bytes received, but haven't been copied out. */ + self->data.seg.offset = htonl(1400); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1000, 0)); + homa_rpc_get_info(srpc, &info); + EXPECT_EQ(0, info.rx_remaining); + EXPECT_EQ(2, skb_queue_len(&srpc->msgin.packets)); + EXPECT_EQ(HOMA_RPC_RX_COPY, info.flags); + + /* Third call: all bytes copied out. 
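+ * homa_copy_to_user should drain msgin.packets, switching the flag
+ * from HOMA_RPC_RX_COPY to HOMA_RPC_RX_READY.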
*/ + homa_rpc_lock(srpc); + homa_copy_to_user(srpc); + homa_rpc_unlock(srpc); + homa_rpc_get_info(srpc, &info); + EXPECT_EQ(0, skb_queue_len(&srpc->msgin.packets)); + EXPECT_EQ(HOMA_RPC_RX_READY, info.flags); + + unit_sock_destroy(&hsk); +} +TEST_F(homa_rpc, homa_rpc_get_info__HOMA_RPC_PRIVATE) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 20000); + struct homa_rpc_info info; + + crpc->flags |= RPC_PRIVATE; + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(HOMA_RPC_PRIVATE, info.flags); +} diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c new file mode 100644 index 00000000..f4a9dfc2 --- /dev/null +++ b/test/unit_homa_skb.c @@ -0,0 +1,765 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +#include "homa_impl.h" +#include "homa_skb.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +static inline struct homa_skb_core *get_skb_core(int core) +{ + return &per_cpu(homa_skb_core, core); +} + +/* Create an skb with 100 bytes of data in the header and frags of + * 200, 300, and 400 bytes. + */ +static struct sk_buff *test_skb(struct homa *homa) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + struct sk_buff *skb = homa_skb_alloc_tx(100); + int32_t data[1000]; + char *src; + int i; + + for (i = 0; i < 1000; i++) + data[i] = 1000000 + 4*i; + src = (char *) data; + memcpy(skb_put(skb, 100), src, 100); + + /* Make sure that the first skb fragment will have a nonzero offset + * within its page. + */ + homa_skb_page_alloc(homa, skb_core); + skb_core->page_inuse = 100; + + homa_skb_append_to_frag(homa, skb, src + 100, 200); + skb_core->page_inuse = skb_core->page_size; + homa_skb_append_to_frag(homa, skb, src + 300, 300); + skb_core->page_inuse = skb_core->page_size; + homa_skb_append_to_frag(homa, skb, src + 600, 400); + + /* Add some data before the transport header, just to make sure + * that functions offset from the proper location. + */ + skb_push(skb, 8); + return skb; +} + +/* Add a given number of pages to the page pool for a given core. */ +static void add_to_pool(struct homa *homa, int num_pages, int core) +{ + struct homa_page_pool *pool = get_skb_core(core)->pool; + int i; + + for (i = 0; i < num_pages; i++) { + pool->pages[pool->avail] = alloc_pages(GFP_KERNEL, + HOMA_SKB_PAGE_ORDER); + pool->avail++; + } +} + +static struct homa_page_pool *hook_pool; + +/* Used to remove a page from hook_pool in a race. 
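+ * Simulates another core emptying the pool after avail has been
+ * checked but before the page can be claimed.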
*/ +static void page_alloc_race_hook(char *id) +{ + if (strcmp(id, "skb_page_alloc_race") != 0) + return; + if ((hook_pool == NULL) || (hook_pool->avail == 0)) + return; + hook_pool->avail--; + put_page(hook_pool->pages[hook_pool->avail]); +} + +FIXTURE(homa_skb) { + struct homa homa; + struct sk_buff *skb; +}; +FIXTURE_SETUP(homa_skb) +{ + homa_init(&self->homa); + self->skb = alloc_skb_fclone(200, GFP_KERNEL); + if (!self->skb) + FAIL("unit_homa_skb setup couldn't allocate skb"); +} +FIXTURE_TEARDOWN(homa_skb) +{ + kfree_skb(self->skb); + homa_destroy(&self->homa); + unit_teardown(); +} + +TEST_F(homa_skb, homa_skb_init__success) +{ + homa_skb_cleanup(&self->homa); + EXPECT_EQ(NULL, self->homa.page_pools[0]); + mock_numa_mask = 0x83; + EXPECT_EQ(0, homa_skb_init(&self->homa)); + EXPECT_NE(NULL, self->homa.page_pools[0]); + EXPECT_NE(NULL, self->homa.page_pools[1]); + EXPECT_EQ(NULL, self->homa.page_pools[2]); + EXPECT_EQ(self->homa.page_pools[1], get_skb_core(0)->pool); + EXPECT_EQ(self->homa.page_pools[1], get_skb_core(1)->pool); + EXPECT_EQ(self->homa.page_pools[0], get_skb_core(2)->pool); + EXPECT_EQ(self->homa.page_pools[0], get_skb_core(6)->pool); + EXPECT_EQ(self->homa.page_pools[1], get_skb_core(7)->pool); + EXPECT_EQ(1, self->homa.max_numa); +} +TEST_F(homa_skb, homa_skb_init__kmalloc_failure) +{ + homa_skb_cleanup(&self->homa); + EXPECT_EQ(NULL, self->homa.page_pools[0]); + mock_numa_mask = 0x2; + mock_kmalloc_errors = 0x2; + EXPECT_EQ(ENOMEM, -homa_skb_init(&self->homa)); + EXPECT_NE(NULL, self->homa.page_pools[0]); + EXPECT_EQ(NULL, self->homa.page_pools[1]); + EXPECT_EQ(NULL, self->homa.page_pools[2]); +} + +TEST_F(homa_skb, homa_skb_cleanup) +{ + struct homa_skb_core *skb_core = get_skb_core(2); + + skb_core->skb_page = alloc_pages(GFP_KERNEL, 2); + add_to_pool(&self->homa, 5, 2); + add_to_pool(&self->homa, 4, 3); + mock_set_core(3); + homa_skb_stash_pages(&self->homa, 2 * HOMA_SKB_PAGE_SIZE - 100); + EXPECT_EQ(5, get_skb_core(2)->pool->avail); + EXPECT_EQ(2, get_skb_core(3)->pool->avail); + EXPECT_EQ(2, get_skb_core(3)->num_stashed_pages); + + homa_skb_cleanup(&self->homa); + EXPECT_EQ(NULL, skb_core->pool); + EXPECT_EQ(NULL, skb_core->skb_page); + EXPECT_EQ(0, get_skb_core(3)->num_stashed_pages); + + skb_core = get_skb_core(nr_cpu_ids-1); + EXPECT_EQ(NULL, skb_core->pool); +} + +TEST_F(homa_skb, homa_skb_stash_pages) +{ + int id = smp_processor_id(); + struct homa_skb_core *skb_core; + + skb_core = get_skb_core(id); + add_to_pool(&self->homa, 5, id); + EXPECT_EQ(5, skb_core->pool->avail); + EXPECT_EQ(0, skb_core->num_stashed_pages); + + /* First attempt: message too small. */ + homa_skb_stash_pages(&self->homa, 10000); + EXPECT_EQ(0, skb_core->num_stashed_pages); + + /* Second attempt: stash pages. */ + homa_skb_stash_pages(&self->homa, 3*HOMA_SKB_PAGE_SIZE - 100); + EXPECT_EQ(3, skb_core->num_stashed_pages); + EXPECT_EQ(2, skb_core->pool->avail); + + /* Third attempt: existing stash adequate. */ + homa_skb_stash_pages(&self->homa, 3 * HOMA_SKB_PAGE_SIZE - 100); + EXPECT_EQ(3, skb_core->num_stashed_pages); + + /* Fourth attempt: not enough pages in pool. 
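+ * Only 2 pages remain at this point, so the stash grows from 3 to 5
+ * rather than covering the 8 pages requested.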
*/ + homa_skb_stash_pages(&self->homa, 8 * HOMA_SKB_PAGE_SIZE - 100); + EXPECT_EQ(5, skb_core->num_stashed_pages); +} + +TEST_F(homa_skb, homa_skb_extend_frags__basics) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + char *p1, *p2, *p3; + int length = 100; + + p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_EQ(100, length); + EXPECT_NE(NULL, p1); + + length = 200; + p2 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_EQ(200, length); + EXPECT_EQ(p1 + 100, p2); + + length = 300; + p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_EQ(300, length); + EXPECT_EQ(p2 + 200, p3); + + EXPECT_EQ(600, skb_core->page_inuse); + EXPECT_EQ(600, self->skb->len); +} +TEST_F(homa_skb, homa_skb_extend_frags__merge_but_reduce_length) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + int length = 1000; + char *p1, *p2; + + p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_EQ(1000, length); + EXPECT_NE(NULL, p1); + + skb_core->page_size = 2048; + length = 2000; + p2 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_EQ(1048, length); + EXPECT_EQ(p1 + 1000, p2); + + EXPECT_EQ(2048, skb_core->page_inuse); +} +TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); + char *p1, *p2, *p3; + int length; + + ASSERT_NE(NULL, skb2); + length = 1000; + p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_EQ(1000, length); + EXPECT_NE(NULL, p1); + EXPECT_EQ(1000, self->skb->len); + + skb_core->page_size = 2048; + length = 1000; + p2 = homa_skb_extend_frags(&self->homa, skb2, &length); + EXPECT_EQ(1000, length); + EXPECT_EQ(p1 + 1024, p2); + EXPECT_EQ(1000, skb2->len); + + length = 1000; + p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_NE(NULL, p3); + EXPECT_EQ(1000, length); + EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); + EXPECT_EQ(0, skb_shinfo(self->skb)->frags[1].offset); + EXPECT_EQ(2000, self->skb->len); + + EXPECT_EQ(1000, skb_core->page_inuse); + kfree_skb(skb2); +} +TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); + char *p1, *p2, *p3; + int length; + + ASSERT_NE(NULL, skb2); + length = 1000; + p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_EQ(1000, length); + EXPECT_NE(NULL, p1); + + skb_core->page_size = 2048; + length = 500; + p2 = homa_skb_extend_frags(&self->homa, skb2, &length); + EXPECT_EQ(500, length); + EXPECT_EQ(p1 + 1024, p2); + + length = 2000; + p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); + EXPECT_EQ(p2 + 512, p3); + EXPECT_EQ(512, length); + EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); + EXPECT_EQ(1536, skb_shinfo(self->skb)->frags[1].offset); + + EXPECT_EQ(2048, skb_core->page_inuse); + kfree_skb(skb2); +} + +TEST_F(homa_skb, homa_skb_page_alloc__free_previous_page) +{ + struct homa_skb_core *skb_core = get_skb_core(2); + struct page *old_page; + + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + old_page = skb_core->skb_page; + get_page(old_page); + EXPECT_EQ(2, mock_page_refs(old_page)); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_NE(old_page, 
skb_core->skb_page); + EXPECT_EQ(1, mock_page_refs(old_page)); + put_page(old_page); +} +TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + struct sk_buff *skb = homa_skb_alloc_tx(100); + struct page *page; + int length = 100; + + homa_skb_extend_frags(&self->homa, skb, &length); + EXPECT_EQ(100, skb_core->page_inuse); + page = skb_core->skb_page; + + homa_skb_free_tx(&self->homa, skb); + EXPECT_EQ(1, page_ref_count(skb_core->skb_page)); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_EQ(page, skb_core->skb_page); + EXPECT_EQ(0, skb_core->page_inuse); +} +TEST_F(homa_skb, homa_skb_page_alloc__from_stash) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + + add_to_pool(&self->homa, 5, smp_processor_id()); + homa_skb_stash_pages(&self->homa, 3*HOMA_SKB_PAGE_SIZE - 100); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_EQ(HOMA_SKB_PAGE_SIZE, skb_core->page_size); + EXPECT_EQ(0, skb_core->page_inuse); + EXPECT_EQ(2, skb_core->num_stashed_pages); +} +TEST_F(homa_skb, homa_skb_page_alloc__from_pool) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + + add_to_pool(&self->homa, 5, smp_processor_id()); + EXPECT_EQ(5, skb_core->pool->avail); + EXPECT_EQ(0, skb_core->num_stashed_pages); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_EQ(4, skb_core->pool->avail); +} +TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + + add_to_pool(&self->homa, 1, smp_processor_id()); + EXPECT_EQ(1, skb_core->pool->avail); + EXPECT_EQ(0, skb_core->num_stashed_pages); + hook_pool = skb_core->pool; + unit_hook_register(page_alloc_race_hook); + mock_alloc_page_errors = 3; + + EXPECT_FALSE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_EQ(NULL, skb_core->skb_page); + EXPECT_EQ(0, skb_core->pool->avail); +} +TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + + mock_clock_tick = 100; + EXPECT_EQ(0, skb_core->pool->avail); + EXPECT_EQ(0, skb_core->num_stashed_pages); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_EQ(HOMA_SKB_PAGE_SIZE, skb_core->page_size); + EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); + EXPECT_EQ(100, homa_metrics_per_cpu()->skb_page_alloc_cycles); +} +TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) +{ + struct homa_skb_core *skb_core = get_skb_core(2); + + mock_clock_tick = 50; + mock_alloc_page_errors = 1; + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_EQ(PAGE_SIZE, skb_core->page_size); + EXPECT_EQ(0, skb_core->page_inuse); + EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); + EXPECT_EQ(50, homa_metrics_per_cpu()->skb_page_alloc_cycles); +} +TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available) +{ + struct homa_skb_core *skb_core = get_skb_core(2); + + mock_alloc_page_errors = 3; + EXPECT_FALSE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_EQ(NULL, skb_core->skb_page); +} + +TEST_F(homa_skb, homa_skb_append_to_frag__basics) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + struct skb_shared_info *shinfo = skb_shinfo(self->skb); + char *p; + + 
/* First append fits in a single block. */ + EXPECT_EQ(0, homa_skb_append_to_frag(&self->homa, self->skb, "abcd", 4)); + + /* Second append spills into a new frag. */ + skb_core->page_size = 10; + EXPECT_EQ(0, homa_skb_append_to_frag(&self->homa, self->skb, + "0123456789ABCDEFGHIJ", 21)); + + EXPECT_EQ(2, shinfo->nr_frags); + EXPECT_EQ(10, skb_frag_size(&shinfo->frags[0])); + p = ((char *) page_address(skb_frag_page(&shinfo->frags[0]))) + + shinfo->frags[0].offset; + p[skb_frag_size(&shinfo->frags[0])] = 0; + EXPECT_STREQ("abcd012345", p); + + EXPECT_EQ(15, skb_frag_size(&shinfo->frags[1])); + p = ((char *) page_address(skb_frag_page(&shinfo->frags[1]))) + + shinfo->frags[1].offset; + EXPECT_STREQ("6789ABCDEFGHIJ", p); +} +TEST_F(homa_skb, homa_skb_append_to_frag__no_memory) +{ + mock_alloc_page_errors = 3; + EXPECT_EQ(ENOMEM, -homa_skb_append_to_frag(&self->homa, self->skb, + "abcd", 4)); +} + +TEST_F(homa_skb, homa_skb_append_from_iter__basics) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); + struct skb_shared_info *shinfo = skb_shinfo(self->skb); + + /* First append fits in a single block. */ + unit_log_clear(); + EXPECT_EQ(0, homa_skb_append_from_iter(&self->homa, self->skb, iter, + 2000)); + EXPECT_STREQ("_copy_from_iter 2000 bytes at 1000", + unit_log_get()); + + /* Second append spills into a new frag. */ + skb_core->page_size = 4096; + unit_log_clear(); + EXPECT_EQ(0, homa_skb_append_from_iter(&self->homa, self->skb, iter, + 3000)); + EXPECT_STREQ("_copy_from_iter 2096 bytes at 3000; " + "_copy_from_iter 904 bytes at 5096", + unit_log_get()); + + EXPECT_EQ(2, shinfo->nr_frags); + EXPECT_EQ(4096, skb_frag_size(&shinfo->frags[0])); + EXPECT_EQ(904, skb_frag_size(&shinfo->frags[1])); +} +TEST_F(homa_skb, homa_skb_append_from_iter__no_memory) +{ + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + + mock_alloc_page_errors = 3; + EXPECT_EQ(ENOMEM, -homa_skb_append_from_iter(&self->homa, self->skb, + iter, 2000)); +} + +TEST_F(homa_skb, homa_skb_append_from_skb__header_only) +{ + struct sk_buff *src_skb = test_skb(&self->homa); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); + int32_t data[500]; + + EXPECT_EQ(0, homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, + 20, 60)); + memset(data, 0, sizeof(data)); + homa_skb_get(dst_skb, data, 0, 60); + EXPECT_EQ(1000020, data[0]); + EXPECT_EQ(1000076, data[14]); + + kfree_skb(src_skb); + kfree_skb(dst_skb); +} +TEST_F(homa_skb, homa_skb_append_from_skb__error_copying_header) +{ + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); + struct sk_buff *src_skb = test_skb(&self->homa); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); + + mock_alloc_page_errors = -1; + skb_core->page_inuse = skb_core->page_size; + EXPECT_EQ(ENOMEM, -homa_skb_append_from_skb(&self->homa, dst_skb, + src_skb, 20, 60)); + + kfree_skb(src_skb); + kfree_skb(dst_skb); +} +TEST_F(homa_skb, homa_skb_append_from_skb__header_and_first_frag) +{ + struct sk_buff *src_skb = test_skb(&self->homa); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); + struct skb_shared_info *dst_shinfo; + int32_t data[500]; + + dst_shinfo = skb_shinfo(dst_skb); + EXPECT_EQ(0, homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, + 80, 100)); + memset(data, 0, sizeof(data)); + homa_skb_get(dst_skb, data, 0, 100); + EXPECT_EQ(1000080, data[0]); + EXPECT_EQ(1000176, data[24]); + EXPECT_EQ(2, dst_shinfo->nr_frags); + EXPECT_EQ(100, dst_skb->len); + + kfree_skb(src_skb); + 
kfree_skb(dst_skb); +} +TEST_F(homa_skb, homa_skb_append_from_skb__multiple_frags) +{ + struct sk_buff *src_skb = test_skb(&self->homa); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); + struct skb_shared_info *dst_shinfo; + int32_t data[500]; + + dst_shinfo = skb_shinfo(dst_skb); + EXPECT_EQ(0, homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, + 320, 600)); + memset(data, 0, sizeof(data)); + homa_skb_get(dst_skb, data, 0, 600); + EXPECT_EQ(1000320, data[0]); + EXPECT_EQ(1000916, data[149]); + EXPECT_EQ(2, dst_shinfo->nr_frags); + EXPECT_EQ(600, dst_skb->len); + + kfree_skb(src_skb); + kfree_skb(dst_skb); +} +TEST_F(homa_skb, homa_skb_append_from_skb__dst_runs_out_of_frags) +{ + struct sk_buff *src_skb = test_skb(&self->homa); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); + struct skb_shared_info *dst_shinfo; + int i, err; + + dst_shinfo = skb_shinfo(dst_skb); + mock_max_skb_frags = 4; + for (i = 0; i < 10; i++) { + err = homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, + 320, 40); + if (err) + break; + } + EXPECT_EQ(4, i); + EXPECT_EQ(EINVAL, -err); + EXPECT_EQ(4, dst_shinfo->nr_frags); + + kfree_skb(src_skb); + kfree_skb(dst_skb); +} + +TEST_F(homa_skb, homa_skb_free_many_tx__basics) +{ + struct sk_buff *skbs[2]; + int i, length; + + skbs[0] = homa_skb_alloc_tx(100); + for (i = 0; i < 3; i++) { + length = 2*HOMA_SKB_PAGE_SIZE; + homa_skb_extend_frags(&self->homa, skbs[0], &length); + } + EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length); + + skbs[1] = homa_skb_alloc_tx(100); + length = 2 * HOMA_SKB_PAGE_SIZE; + homa_skb_extend_frags(&self->homa, skbs[1], &length); + + homa_skb_free_many_tx(&self->homa, skbs, 2); + EXPECT_EQ(3, self->homa.page_pools[0]->avail); +} +TEST_F(homa_skb, homa_skb_free_many_tx__skb_ref_count_not_one) +{ + struct sk_buff *skb; + struct page *page; + int length; + + skb = homa_skb_alloc_tx(100); + length = HOMA_SKB_PAGE_SIZE; + homa_skb_extend_frags(&self->homa, skb, &length); + EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length); + page = skb_frag_page(&skb_shinfo(skb)->frags[0]); + EXPECT_EQ(2, page_ref_count(page)); + skb_get(skb); + EXPECT_EQ(2, refcount_read(&skb->users)); + + homa_skb_free_many_tx(&self->homa, &skb, 1); + EXPECT_EQ(2, page_ref_count(page)); + EXPECT_EQ(1, refcount_read(&skb->users)); + kfree_skb(skb); +} +TEST_F(homa_skb, homa_skb_free_many_tx__check_page_order) +{ + struct sk_buff *skb; + struct page *page; + int i, length; + + skb = homa_skb_alloc_tx(100); + for (i = 0; i < 4; i++) { + length = 2 * HOMA_SKB_PAGE_SIZE; + homa_skb_extend_frags(&self->homa, skb, &length); + } + EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length); + page = skb_frag_page(&skb_shinfo(skb)->frags[2]); + + mock_compound_order_mask = 3; + homa_skb_free_many_tx(&self->homa, &skb, 1); + EXPECT_EQ(1, self->homa.page_pools[0]->avail); + EXPECT_EQ(page, self->homa.page_pools[0]->pages[0]); +} + +TEST_F(homa_skb, homa_skb_cache_pages__different_numa_nodes) +{ + struct page *pages[4]; + int i; + + for (i = 0; i < 4; i++) + pages[i] = alloc_pages(GFP_KERNEL, HOMA_SKB_PAGE_ORDER); + mock_page_nid_mask = 7; + homa_skb_cache_pages(&self->homa, pages, 4); + EXPECT_EQ(1, self->homa.page_pools[0]->avail); + EXPECT_EQ(3, self->homa.page_pools[1]->avail); + EXPECT_EQ(pages[3], self->homa.page_pools[0]->pages[0]); + EXPECT_EQ(pages[1], self->homa.page_pools[1]->pages[1]); +} +TEST_F(homa_skb, homa_skb_cache_pages__pool_size_exceeded) +{ + struct page *pages[6]; + int i; + + for (i = 0; i < 6; i++) + pages[i] = alloc_pages(GFP_KERNEL, HOMA_SKB_PAGE_ORDER); + homa_skb_cache_pages(&self->homa, pages, 4); + 
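/* Only the first four pages were handed off to homa_skb_cache_pages above, so pages[4] and pages[5] must be released by hand. */ + 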
EXPECT_EQ(4, self->homa.page_pools[0]->avail); + put_page(pages[4]); + put_page(pages[5]); +} + +TEST_F(homa_skb, homa_skb_get) +{ + struct sk_buff *skb = test_skb(&self->homa); + int32_t data[500]; + + /* Data is entirely in the head. */ + memset(data, 0, sizeof(data)); + homa_skb_get(skb, data, 20, 40); + EXPECT_EQ(1000020, data[0]); + EXPECT_EQ(1000056, data[9]); + EXPECT_EQ(0, data[10]); + + /* Data spans head and first frag. */ + memset(data, 0, sizeof(data)); + homa_skb_get(skb, data, 80, 60); + EXPECT_EQ(1000080, data[0]); + EXPECT_EQ(1000096, data[4]); + EXPECT_EQ(1000100, data[5]); + EXPECT_EQ(1000136, data[14]); + EXPECT_EQ(0, data[15]); + + /* Data spans 3 frags. */ + memset(data, 0, sizeof(data)); + homa_skb_get(skb, data, 280, 500); + EXPECT_EQ(1000280, data[0]); + EXPECT_EQ(1000296, data[4]); + EXPECT_EQ(1000300, data[5]); + EXPECT_EQ(1000596, data[79]); + EXPECT_EQ(1000600, data[80]); + EXPECT_EQ(1000776, data[124]); + EXPECT_EQ(0, data[125]); + + /* Data extends past end of skb. */ + memset(data, 0, sizeof(data)); + homa_skb_get(skb, data, 960, 100); + EXPECT_EQ(1000960, data[0]); + EXPECT_EQ(1000996, data[9]); + EXPECT_EQ(0, data[10]); + + kfree_skb(skb); +} + +TEST_F(homa_skb, homa_skb_release_pages__basics) +{ + EXPECT_EQ(0UL, self->homa.skb_page_free_time); + mock_clock = 1000000; + self->homa.skb_page_free_time = 500000; + self->homa.skb_page_frees_per_sec = 10; + self->homa.skb_page_pool_min_kb = 0; + add_to_pool(&self->homa, 10, 0); + get_skb_core(0)->pool->low_mark = 7; + add_to_pool(&self->homa, 3, 1); + get_skb_core(1)->pool->low_mark = 2; + + homa_skb_release_pages(&self->homa); + EXPECT_EQ(5, get_skb_core(0)->pool->avail); + EXPECT_EQ(3, get_skb_core(1)->pool->avail); + EXPECT_EQ(501000000UL, self->homa.skb_page_free_time); +} +TEST_F(homa_skb, homa_skb_release_pages__not_time_to_free) +{ + EXPECT_EQ(0UL, self->homa.skb_page_free_time); + mock_clock = 1000000; + self->homa.skb_page_free_time = 1000001; + self->homa.skb_page_frees_per_sec = 10; + self->homa.skb_page_pool_min_kb = 0; + add_to_pool(&self->homa, 10, 0); + get_skb_core(0)->pool->low_mark = 7; + homa_skb_release_pages(&self->homa); + EXPECT_EQ(10, get_skb_core(0)->pool->avail); +} +TEST_F(homa_skb, homa_skb_release_pages__allocate_skb_pages_to_free) +{ + EXPECT_EQ(0, self->homa.pages_to_free_slots); + mock_clock = 1000000; + self->homa.skb_page_frees_per_sec = 10; + self->homa.skb_page_free_time = 500000; + + /* First call: no current allocation. */ + homa_skb_release_pages(&self->homa); + EXPECT_EQ(5, self->homa.pages_to_free_slots); + + /* Second call: the existing allocation is too small, so it must be freed and reallocated. 
*/ + self->homa.pages_to_free_slots -= 1; + self->homa.skb_page_free_time = 500000; + homa_skb_release_pages(&self->homa); + EXPECT_EQ(5, self->homa.pages_to_free_slots); +} +TEST_F(homa_skb, homa_skb_release_pages__cant_reallocate_skb_pages_to_free) +{ + struct homa_page_pool *pool; + + EXPECT_EQ(0UL, self->homa.skb_page_free_time); + mock_clock = 1000000; + self->homa.skb_page_free_time = 500000; + self->homa.skb_page_frees_per_sec = 20; + self->homa.skb_page_pool_min_kb = 0; + add_to_pool(&self->homa, 20, 0); + pool = get_skb_core(0)->pool; + pool->low_mark = 15; + + EXPECT_EQ(0, self->homa.pages_to_free_slots); + self->homa.skb_pages_to_free = kmalloc_array(4, sizeof(struct page *), + GFP_ATOMIC); + self->homa.pages_to_free_slots = 4; + + mock_kmalloc_errors = 1; + homa_skb_release_pages(&self->homa); + EXPECT_EQ(16, get_skb_core(0)->pool->avail); + EXPECT_EQ(4, self->homa.pages_to_free_slots); +} +TEST_F(homa_skb, homa_skb_release_pages__limited_by_min_kb) +{ + EXPECT_EQ(0UL, self->homa.skb_page_free_time); + mock_clock = 1000000; + self->homa.skb_page_free_time = 500000; + self->homa.skb_page_frees_per_sec = 20; + self->homa.skb_page_pool_min_kb = (5 * HOMA_SKB_PAGE_SIZE) / 1000; + add_to_pool(&self->homa, 10, 0); + get_skb_core(0)->pool->low_mark = 9; + + homa_skb_release_pages(&self->homa); + EXPECT_EQ(6, get_skb_core(0)->pool->avail); +} +TEST_F(homa_skb, homa_skb_release_pages__empty_pool) +{ + EXPECT_EQ(0UL, self->homa.skb_page_free_time); + mock_clock = 2000000; + self->homa.skb_page_free_time = 500000; + self->homa.skb_page_frees_per_sec = 1000; + self->homa.skb_page_pool_min_kb = 0; + add_to_pool(&self->homa, 5, 0); + get_skb_core(0)->pool->low_mark = 5; + + homa_skb_release_pages(&self->homa); + EXPECT_EQ(0, get_skb_core(0)->pool->avail); +} diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c new file mode 100644 index 00000000..dd83df82 --- /dev/null +++ b/test/unit_homa_sock.c @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +#include "homa_impl.h" +#include "homa_interest.h" +#include "homa_sock.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +#define n(x) htons(x) +#define N(x) htonl(x) + +static struct homa_sock *hook_hsk; +static int hook_count; +static void schedule_hook(char *id) +{ + if (strcmp(id, "schedule_timeout") != 0) + return; + if (hook_count <= 0) + return; + hook_count--; + if (hook_count != 0) + return; + hook_hsk->sock.sk_sndbuf = refcount_read(&hook_hsk->sock.sk_wmem_alloc) + + 100; +} + +FIXTURE(homa_sock) { + struct homa homa; + struct homa_net *hnet; + struct homa_sock hsk; + struct in6_addr client_ip[1]; + int client_port; + struct in6_addr server_ip[1]; + int server_port; + u64 client_id; +}; +FIXTURE_SETUP(homa_sock) +{ + homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); + self->client_ip[0] = unit_get_in_addr("196.168.0.1"); + self->client_port = 40000; + self->server_ip[0] = unit_get_in_addr("1.2.3.4"); + self->server_port = 99; + self->client_id = 1234; + unit_log_clear(); +} +FIXTURE_TEARDOWN(homa_sock) +{ + homa_destroy(&self->homa); + unit_teardown(); +} + +TEST_F(homa_sock, homa_socktab_destroy) +{ + struct homa_sock hsk1, hsk2, hsk3; + struct homa_net *hnet; + + hnet = mock_hnet(1, &self->homa); + mock_sock_init(&hsk1, hnet, 100); + mock_sock_init(&hsk2, hnet, 101); + mock_sock_init(&hsk3, self->hnet, 100); + EXPECT_EQ(0, hsk1.shutdown); + EXPECT_EQ(0, hsk2.shutdown); + 
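/* hsk3 belongs to a different hnet than hsk1 and hsk2, so the per-hnet destroy below should shut down only the first two sockets. */ + 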
EXPECT_EQ(0, hsk3.shutdown); + + homa_socktab_destroy(self->homa.socktab, hnet); + EXPECT_EQ(1, hsk1.shutdown); + EXPECT_EQ(1, hsk2.shutdown); + EXPECT_EQ(0, hsk3.shutdown); + + homa_socktab_destroy(self->homa.socktab, NULL); + EXPECT_EQ(1, hsk3.shutdown); +} + +TEST_F(homa_sock, homa_socktab_start_scan) +{ + struct homa_socktab_scan scan; + + homa_destroy(&self->homa); + homa_init(&self->homa); + mock_sock_init(&self->hsk, self->hnet, HOMA_MIN_DEFAULT_PORT+100); + EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.socktab, + &scan)); + EXPECT_EQ(100, scan.current_bucket); + EXPECT_EQ(1, mock_sock_holds); + homa_socktab_end_scan(&scan); +} + +TEST_F(homa_sock, homa_socktab_next) +{ + struct homa_sock hsk1, hsk2, hsk3, hsk4, *hsk; + struct homa_socktab_scan scan; + int first_port = 34000; + + homa_destroy(&self->homa); + homa_init(&self->homa); + mock_sock_init(&hsk1, self->hnet, first_port); + mock_sock_init(&hsk2, self->hnet, first_port+HOMA_SOCKTAB_BUCKETS); + mock_sock_init(&hsk3, self->hnet, first_port+2*HOMA_SOCKTAB_BUCKETS); + mock_sock_init(&hsk4, self->hnet, first_port+5); + hsk = homa_socktab_start_scan(self->homa.socktab, &scan); + EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); + EXPECT_EQ(1, mock_sock_holds); + hsk = homa_socktab_next(&scan); + EXPECT_EQ(first_port+HOMA_SOCKTAB_BUCKETS, hsk->port); + EXPECT_EQ(1, mock_sock_holds); + hsk = homa_socktab_next(&scan); + EXPECT_EQ(first_port, hsk->port); + EXPECT_EQ(1, mock_sock_holds); + hsk = homa_socktab_next(&scan); + EXPECT_EQ(first_port+5, hsk->port); + EXPECT_EQ(1, mock_sock_holds); + hsk = homa_socktab_next(&scan); + EXPECT_EQ(NULL, hsk); + EXPECT_EQ(0, mock_sock_holds); + unit_sock_destroy(&hsk1); + unit_sock_destroy(&hsk2); + unit_sock_destroy(&hsk3); + unit_sock_destroy(&hsk4); + homa_socktab_end_scan(&scan); +} + +TEST_F(homa_sock, homa_socktab_end_scan) +{ + struct homa_socktab_scan scan1, scan2, scan3; + + homa_destroy(&self->homa); + homa_init(&self->homa); + mock_sock_init(&self->hsk, self->hnet, HOMA_MIN_DEFAULT_PORT+100); + homa_socktab_start_scan(self->homa.socktab, &scan1); + homa_socktab_start_scan(self->homa.socktab, &scan2); + homa_socktab_start_scan(self->homa.socktab, &scan3); + EXPECT_EQ(3, mock_sock_holds); + homa_socktab_next(&scan2); + EXPECT_EQ(2, mock_sock_holds); + homa_socktab_end_scan(&scan1); + EXPECT_EQ(1, mock_sock_holds); + homa_socktab_end_scan(&scan2); + EXPECT_EQ(1, mock_sock_holds); + homa_socktab_end_scan(&scan3); + EXPECT_EQ(0, mock_sock_holds); +} + +TEST_F(homa_sock, homa_sock_init__cant_allocate_buffer_pool) +{ + struct homa_sock sock; + + mock_kmalloc_errors = 1; + EXPECT_EQ(ENOMEM, -homa_sock_init(&sock)); + unit_sock_destroy(&sock); +} +TEST_F(homa_sock, homa_sock_init__skip_port_in_use) +{ + struct homa_sock hsk2, hsk3; + + self->hnet->prev_default_port = 0xfffe; + mock_sock_init(&hsk2, self->hnet, 0); + mock_sock_init(&hsk3, self->hnet, 0); + EXPECT_EQ(65535, hsk2.port); + EXPECT_EQ(32769, hsk3.port); + unit_sock_destroy(&hsk2); + unit_sock_destroy(&hsk3); +} +TEST_F(homa_sock, homa_sock_init__all_ports_in_use) +{ + struct homa_sock hsk2, hsk3, hsk4; + + mock_min_default_port = -2; + EXPECT_EQ(0, -mock_sock_init(&hsk2, self->hnet, 0)); + EXPECT_EQ(0, -mock_sock_init(&hsk3, self->hnet, 0)); + EXPECT_EQ(EADDRNOTAVAIL, -mock_sock_init(&hsk4, self->hnet, 0)); + EXPECT_EQ(65534, hsk2.port); + EXPECT_EQ(65535, hsk3.port); + EXPECT_EQ(1, hsk4.shutdown); + unit_sock_destroy(&hsk2); + unit_sock_destroy(&hsk3); + unit_sock_destroy(&hsk4); +} +TEST_F(homa_sock, 
homa_sock_init__ip_header_length) +{ + struct homa_sock hsk_v4, hsk_v6; + + mock_ipv6 = false; + mock_sock_init(&hsk_v4, self->hnet, 0); + mock_ipv6 = true; + mock_sock_init(&hsk_v6, self->hnet, 0); + EXPECT_EQ(sizeof(struct iphdr), hsk_v4.ip_header_length); + EXPECT_EQ(sizeof(struct ipv6hdr), hsk_v6.ip_header_length); + unit_sock_destroy(&hsk_v4); + unit_sock_destroy(&hsk_v6); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_sock, homa_sock_init__hijack_tcp) +{ + struct homa_sock hijack, no_hijack; + + self->homa.hijack_tcp = 0; + mock_sock_init(&no_hijack, self->hnet, 0); + self->homa.hijack_tcp = 1; + mock_sock_init(&hijack, self->hnet, 0); + EXPECT_EQ(0, no_hijack.sock.sk_protocol); + EXPECT_EQ(IPPROTO_TCP, hijack.sock.sk_protocol); + unit_sock_destroy(&hijack); + unit_sock_destroy(&no_hijack); +} +#endif /* See strip.py */ + +TEST_F(homa_sock, homa_sock_unlink__remove_from_map) +{ + struct homa_sock hsk2, hsk3; + int client2, client3; + + mock_sock_init(&hsk2, self->hnet, 0); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk2, 100)); + client2 = hsk2.port; + mock_sock_init(&hsk3, self->hnet, 0); + client3 = hsk3.port; + + EXPECT_EQ(&hsk2, homa_sock_find(self->hnet, client2)); + EXPECT_EQ(&hsk3, homa_sock_find(self->hnet, client3)); + sock_put(&hsk2.sock); + sock_put(&hsk3.sock); + + unit_sock_destroy(&hsk2); + + EXPECT_EQ(NULL, homa_sock_find(self->hnet, client2)); + EXPECT_EQ(&hsk3, homa_sock_find(self->hnet, client3)); + sock_put(&hsk3.sock); + + unit_sock_destroy(&hsk3); + + EXPECT_EQ(NULL, homa_sock_find(self->hnet, client2)); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, client3)); +} + +TEST_F(homa_sock, homa_sock_shutdown__unlink_socket) +{ + struct homa_sock hsk; + int client; + + mock_sock_init(&hsk, self->hnet, 0); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk, 100)); + client = hsk.port; + EXPECT_EQ(&hsk, homa_sock_find(self->hnet, client)); + sock_put(&hsk.sock); + + homa_sock_shutdown(&hsk); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, client)); + homa_sock_destroy(&hsk.sock); +} +TEST_F(homa_sock, homa_sock_shutdown__already_shutdown) +{ + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 20000, 1600); + unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id+2, + 5000, 5000); + self->hsk.shutdown = 1; + homa_sock_shutdown(&self->hsk); + EXPECT_TRUE(self->hsk.shutdown); + EXPECT_EQ(2, unit_list_length(&self->hsk.active_rpcs)); + self->hsk.shutdown = 0; +} +TEST_F(homa_sock, homa_sock_shutdown__delete_rpcs) +{ + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 20000, 1600); + unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id+2, + 5000, 5000); + homa_sock_shutdown(&self->hsk); + EXPECT_TRUE(self->hsk.shutdown); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + homa_sock_destroy(&self->hsk.sock); +} +TEST_F(homa_sock, homa_sock_shutdown__wakeup_interests) +{ + struct homa_interest interest1, interest2; + + mock_log_wakeups = 1; + homa_interest_init_shared(&interest1, &self->hsk); + homa_interest_init_shared(&interest2, &self->hsk); + unit_log_clear(); + + homa_sock_shutdown(&self->hsk); + EXPECT_TRUE(self->hsk.shutdown); + EXPECT_EQ(1, atomic_read(&interest1.ready)); + EXPECT_EQ(1, atomic_read(&interest2.ready)); + EXPECT_EQ(NULL, interest1.rpc); + EXPECT_EQ(NULL, interest2.rpc); + 
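/* Shutdown should also have unlinked each interest from the socket's lists, issuing one wakeup per interest. */ + 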
EXPECT_TRUE(list_empty(&interest1.links)); + EXPECT_STREQ("wake_up; wake_up", unit_log_get()); + homa_sock_destroy(&self->hsk.sock); +} + +TEST_F(homa_sock, homa_sock_bind) +{ + struct homa_sock hsk2; + + mock_sock_init(&hsk2, self->hnet, 0); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk2, 100)); + + EXPECT_EQ(0, -homa_sock_bind(self->hnet, &self->hsk, 0)); + EXPECT_EQ(HOMA_MIN_DEFAULT_PORT, self->hsk.port); + EXPECT_EQ(EINVAL, -homa_sock_bind(self->hnet, &self->hsk, + HOMA_MIN_DEFAULT_PORT + 100)); + EXPECT_STREQ("port number invalid: in the automatically assigned range", + self->hsk.error_msg); + + EXPECT_EQ(EADDRINUSE, -homa_sock_bind(self->hnet, &self->hsk, 100)); + EXPECT_STREQ("requested port number is already in use", + self->hsk.error_msg); + EXPECT_EQ(0, -homa_sock_bind(self->hnet, &hsk2, 100)); + + EXPECT_EQ(0, -homa_sock_bind(self->hnet, &self->hsk, 110)); + + EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, 110)); + sock_put(&self->hsk.sock); + EXPECT_EQ(0, -homa_sock_bind(self->hnet, &self->hsk, 120)); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, 110)); + EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, 120)); + sock_put(&self->hsk.sock); + unit_sock_destroy(&hsk2); +} +TEST_F(homa_sock, homa_sock_bind__socket_shutdown) +{ + unit_sock_destroy(&self->hsk); + EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(self->hnet, &self->hsk, 100)); + EXPECT_STREQ("socket has been shut down", self->hsk.error_msg); +} + +TEST_F(homa_sock, homa_sock_find__basics) +{ + struct homa_sock hsk2; + + mock_sock_init(&hsk2, self->hnet, 0); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk2, 100)); + EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, self->hsk.port)); + sock_put(&self->hsk.sock); + EXPECT_EQ(&hsk2, homa_sock_find(self->hnet, hsk2.port)); + sock_put(&hsk2.sock); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, hsk2.port + 1)); + unit_sock_destroy(&hsk2); +} +TEST_F(homa_sock, homa_sock_find__same_port_in_different_hnets) +{ + struct homa_sock hsk1, hsk2; + struct homa_sock *hsk; + struct homa_net *hnet; + + hnet = mock_hnet(1, &self->homa); + mock_sock_init(&hsk1, self->hnet, 100); + mock_sock_init(&hsk2, hnet, 100); + + hsk = homa_sock_find(self->hnet, 100); + EXPECT_EQ(&hsk1, hsk); + hsk = homa_sock_find(hnet, 100); + EXPECT_EQ(&hsk2, hsk); + + sock_put(&hsk1.sock); + sock_put(&hsk2.sock); + unit_sock_destroy(&hsk1); + unit_sock_destroy(&hsk2); +} + +TEST_F(homa_sock, homa_sock_find__long_hash_chain) +{ + struct homa_sock hsk2, hsk3, hsk4; + + EXPECT_EQ(0, homa_sock_bind(self->hnet, &self->hsk, 13)); + mock_sock_init(&hsk2, self->hnet, 0); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk2, + 2*HOMA_SOCKTAB_BUCKETS + 13)); + mock_sock_init(&hsk3, self->hnet, 0); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk3, + 3*HOMA_SOCKTAB_BUCKETS + 13)); + mock_sock_init(&hsk4, self->hnet, 0); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk4, + 5*HOMA_SOCKTAB_BUCKETS + 13)); + + EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, 13)); + sock_put(&self->hsk.sock); + EXPECT_EQ(&hsk2, homa_sock_find(self->hnet, 2*HOMA_SOCKTAB_BUCKETS + 13)); + sock_put(&hsk2.sock); + EXPECT_EQ(&hsk3, homa_sock_find(self->hnet, + 3*HOMA_SOCKTAB_BUCKETS + 13)); + sock_put(&hsk3.sock); + EXPECT_EQ(&hsk4, homa_sock_find(self->hnet, + 5*HOMA_SOCKTAB_BUCKETS + 13)); + sock_put(&hsk4.sock); + + unit_sock_destroy(&hsk2); + unit_sock_destroy(&hsk3); + unit_sock_destroy(&hsk4); +} + +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_sock, homa_sock_lock_slow) +{ + mock_clock_tick = 100; + + homa_sock_lock(&self->hsk); + EXPECT_EQ(0, 
homa_metrics_per_cpu()->socket_lock_misses); + EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_miss_cycles); + homa_sock_unlock(&self->hsk); + + mock_trylock_errors = 1; + homa_sock_lock(&self->hsk); + EXPECT_EQ(1, homa_metrics_per_cpu()->socket_lock_misses); + EXPECT_EQ(100, homa_metrics_per_cpu()->socket_lock_miss_cycles); + homa_sock_unlock(&self->hsk); +} +#endif /* See strip.py */ + +TEST_F(homa_sock, homa_sock_wait_wmem__no_memory_shortage) +{ + EXPECT_EQ(0, -homa_sock_wait_wmem(&self->hsk, 1)); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} +TEST_F(homa_sock, homa_sock_wait_wmem__nonblocking) +{ + self->hsk.sock.sk_sndbuf = 0; + EXPECT_EQ(EWOULDBLOCK, -homa_sock_wait_wmem(&self->hsk, 1)); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} +TEST_F(homa_sock, homa_sock_wait_wmem__thread_blocks_then_wakes) +{ + self->hsk.sock.sk_sndbuf = 0; + self->hsk.sock.sk_sndtimeo = 6; + hook_hsk = &self->hsk; + hook_count = 5; + unit_hook_register(schedule_hook); + + EXPECT_EQ(0, -homa_sock_wait_wmem(&self->hsk, 0)); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} +TEST_F(homa_sock, homa_sock_wait_wmem__thread_blocks_but_times_out) +{ + self->hsk.sock.sk_sndbuf = 0; + self->hsk.sock.sk_sndtimeo = 4; + hook_hsk = &self->hsk; + hook_count = 5; + unit_hook_register(schedule_hook); + + EXPECT_EQ(EWOULDBLOCK, -homa_sock_wait_wmem(&self->hsk, 0)); +} +TEST_F(homa_sock, homa_sock_wait_wmem__interrupted_by_signal) +{ + self->hsk.sock.sk_sndbuf = 0; + mock_prepare_to_wait_errors = 1; + mock_signal_pending = 1; + + EXPECT_EQ(EINTR, -homa_sock_wait_wmem(&self->hsk, 0)); +} + +TEST_F(homa_sock, homa_sock_wakeup_wmem) +{ + self->hsk.sock.sk_sndbuf = 0; + set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); + + /* First call: no memory available. */ + homa_sock_wakeup_wmem(&self->hsk); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); + + /* Second call: memory now available. */ + self->hsk.sock.sk_sndbuf = 1000000; + mock_log_wakeups = 1; + unit_log_clear(); + homa_sock_wakeup_wmem(&self->hsk); + EXPECT_EQ(0, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); + EXPECT_STREQ("wake_up", unit_log_get()); +} diff --git a/test/unit_homa_socktab.c b/test/unit_homa_socktab.c deleted file mode 100644 index 32f88b57..00000000 --- a/test/unit_homa_socktab.c +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#include "homa_impl.h" -#define KSELFTEST_NOT_MAIN 1 -#include "kselftest_harness.h" -#include "ccutils.h" -#include "mock.h" -#include "utils.h" - -#define n(x) htons(x) -#define N(x) htonl(x) - -FIXTURE(homa_socktab) { - struct homa homa; - struct homa_sock hsk; - struct in6_addr client_ip[1]; - int client_port; - struct in6_addr server_ip[1]; - int server_port; - __u64 client_id; -}; -FIXTURE_SETUP(homa_socktab) -{ - homa_init(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); - self->client_ip[0] = unit_get_in_addr("196.168.0.1"); - self->client_port = 40000; - self->server_ip[0] = unit_get_in_addr("1.2.3.4"); - self->server_port = 99; - self->client_id = 1234; -} -FIXTURE_TEARDOWN(homa_socktab) -{ - homa_destroy(&self->homa); - unit_teardown(); -} - -TEST_F(homa_socktab, homa_port_hash) -{ - EXPECT_EQ(1023, homa_port_hash(0xffff)); - EXPECT_EQ(18, homa_port_hash(0x6012)); - EXPECT_EQ(99, homa_port_hash(99)); -} - -TEST_F(homa_socktab, homa_socktab_start_scan) -{ - struct homa_socktab_scan scan; - homa_destroy(&self->homa); - homa_init(&self->homa); - mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100); - EXPECT_EQ(&self->hsk, homa_socktab_start_scan(&self->homa.port_map, - &scan)); - EXPECT_EQ(100, scan.current_bucket); -} - -TEST_F(homa_socktab, homa_socktab_next__basics) -{ - struct homa_sock hsk1, hsk2, hsk3, hsk4, *hsk; - struct homa_socktab_scan scan; - int first_port = 34000; - homa_destroy(&self->homa); - homa_init(&self->homa); - mock_sock_init(&hsk1, &self->homa, first_port); - mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); - mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS); - mock_sock_init(&hsk4, &self->homa, first_port+5); - hsk = homa_socktab_start_scan(&self->homa.port_map, &scan); - EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(first_port+HOMA_SOCKTAB_BUCKETS, hsk->port); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(first_port, hsk->port); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(first_port+5, hsk->port); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(NULL, hsk); - homa_sock_destroy(&hsk1); - homa_sock_destroy(&hsk2); - homa_sock_destroy(&hsk3); - homa_sock_destroy(&hsk4); -} -TEST_F(homa_socktab, homa_socktab_next__deleted_socket) -{ - struct homa_sock hsk1, hsk2, hsk3, *hsk; - struct homa_socktab_scan scan; - int first_port = 34000; - homa_destroy(&self->homa); - homa_init(&self->homa); - mock_sock_init(&hsk1, &self->homa, first_port); - mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); - mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS); - hsk = homa_socktab_start_scan(&self->homa.port_map, &scan); - EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); - homa_sock_destroy(&hsk2); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(first_port+HOMA_SOCKTAB_BUCKETS, hsk->port); - EXPECT_EQ(1, hsk->shutdown); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(first_port, hsk->port); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(NULL, hsk); - homa_sock_destroy(&hsk1); - homa_sock_destroy(&hsk3); -} - -TEST_F(homa_socktab, homa_sock_init__skip_port_in_use) -{ - struct homa_sock hsk2, hsk3; - self->homa.next_client_port = 0xffff; - mock_sock_init(&hsk2, &self->homa, 0); - mock_sock_init(&hsk3, &self->homa, 0); - EXPECT_EQ(65535, hsk2.port); - EXPECT_EQ(32769, hsk3.port); - homa_sock_destroy(&hsk2); - homa_sock_destroy(&hsk3); -} -TEST_F(homa_socktab, homa_sock_init__ip_header_length) -{ - struct homa_sock 
hsk_v4, hsk_v6; - mock_ipv6 = false; - mock_sock_init(&hsk_v4, &self->homa, 0); - mock_ipv6 = true; - mock_sock_init(&hsk_v6, &self->homa, 0); - EXPECT_EQ(HOMA_IPV4_HEADER_LENGTH, hsk_v4.ip_header_length); - EXPECT_EQ(HOMA_IPV6_HEADER_LENGTH, hsk_v6.ip_header_length); - homa_sock_destroy(&hsk_v4); - homa_sock_destroy(&hsk_v6); -} - - -TEST_F(homa_socktab, homa_sock_shutdown__basics) -{ - int client2, client3; - struct homa_sock hsk2, hsk3; - mock_sock_init(&hsk2, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk2, 100)); - client2 = hsk2.port; - mock_sock_init(&hsk3, &self->homa, 0); - client3 = hsk3.port; - - EXPECT_EQ(&hsk2, homa_sock_find(&self->homa.port_map, client2)); - EXPECT_EQ(&hsk2, homa_sock_find(&self->homa.port_map, 100)); - EXPECT_EQ(&hsk3, homa_sock_find(&self->homa.port_map, client3)); - - homa_sock_shutdown(&hsk2); - - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, client2)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, 100)); - EXPECT_EQ(&hsk3, homa_sock_find(&self->homa.port_map, client3)); - - homa_sock_shutdown(&hsk3); - - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, client2)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, 100)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, client3)); -} -TEST_F(homa_socktab, homa_sock_shutdown__already_shutdown) -{ - unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 20000, 1600); - unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, self->client_id+2, - 5000, 5000); - self->hsk.shutdown = 1; - homa_sock_shutdown(&self->hsk); - EXPECT_TRUE(self->hsk.shutdown); - EXPECT_EQ(2 ,unit_list_length(&self->hsk.active_rpcs)); - self->hsk.shutdown = 0; -} -TEST_F(homa_socktab, homa_sock_shutdown__delete_rpcs) -{ - unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 20000, 1600); - unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, self->client_id+2, - 5000, 5000); - homa_sock_shutdown(&self->hsk); - EXPECT_TRUE(self->hsk.shutdown); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); -} -TEST_F(homa_socktab, homa_sock_shutdown__wakeup_interests) -{ - struct homa_interest interest1, interest2, interest3; - struct task_struct task1, task2, task3; - interest1.thread = &task1; - task1.pid = 100; - interest2.thread = &task2; - task2.pid = 200; - interest3.thread = &task3; - task3.pid = 300; - EXPECT_FALSE(self->hsk.shutdown); - list_add_tail(&interest1.request_links, &self->hsk.request_interests); - list_add_tail(&interest2.request_links, &self->hsk.request_interests); - list_add_tail(&interest3.response_links, &self->hsk.response_interests); - homa_sock_shutdown(&self->hsk); - EXPECT_TRUE(self->hsk.shutdown); - EXPECT_STREQ("wake_up_process pid -1; wake_up_process pid 100; " - "wake_up_process pid 200; wake_up_process pid 300", - unit_log_get()); -} - -TEST_F(homa_socktab, homa_sock_bind) -{ - struct homa_sock hsk2; - mock_sock_init(&hsk2, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk2, 100)); - - EXPECT_EQ(0, -homa_sock_bind(&self->homa.port_map, &self->hsk, 0)); - EXPECT_EQ(HOMA_MIN_DEFAULT_PORT, self->hsk.port); - EXPECT_EQ(EINVAL, -homa_sock_bind(&self->homa.port_map, &self->hsk, - HOMA_MIN_DEFAULT_PORT + 100)); - - EXPECT_EQ(EADDRINUSE, -homa_sock_bind(&self->homa.port_map, &self->hsk, - 100)); - EXPECT_EQ(0, 
-homa_sock_bind(&self->homa.port_map, &hsk2, - 100)); - - EXPECT_EQ(0, -homa_sock_bind(&self->homa.port_map, &self->hsk, - 110)); - - EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, 110)); - EXPECT_EQ(0, -homa_sock_bind(&self->homa.port_map, &self->hsk, - 120)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, 110)); - EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, 120)); - homa_sock_destroy(&hsk2); -} -TEST_F(homa_socktab, homa_sock_bind__socket_shutdown) -{ - homa_sock_shutdown(&self->hsk); - EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(&self->homa.port_map, &self->hsk, - 100)); -} - -TEST_F(homa_socktab, homa_sock_find__basics) -{ - struct homa_sock hsk2; - mock_sock_init(&hsk2, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk2, 100)); - EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, - self->hsk.port)); - EXPECT_EQ(&hsk2, homa_sock_find(&self->homa.port_map, - hsk2.port)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, - hsk2.port + 1)); - homa_sock_destroy(&hsk2); -} - -TEST_F(homa_socktab, homa_sock_find__long_hash_chain) -{ - struct homa_sock hsk2, hsk3, hsk4; - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &self->hsk, 13)); - mock_sock_init(&hsk2, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk2, - 2*HOMA_SOCKTAB_BUCKETS + 13)); - mock_sock_init(&hsk3, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk3, - 3*HOMA_SOCKTAB_BUCKETS + 13)); - mock_sock_init(&hsk4, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk4, - 5*HOMA_SOCKTAB_BUCKETS + 13)); - - EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, - 13)); - EXPECT_EQ(&hsk2, homa_sock_find(&self->homa.port_map, - 2*HOMA_SOCKTAB_BUCKETS + 13)); - EXPECT_EQ(&hsk3, homa_sock_find(&self->homa.port_map, - 3*HOMA_SOCKTAB_BUCKETS + 13)); - EXPECT_EQ(&hsk4, homa_sock_find(&self->homa.port_map, - 5*HOMA_SOCKTAB_BUCKETS + 13)); - - homa_sock_destroy(&hsk2); - homa_sock_destroy(&hsk3); - homa_sock_destroy(&hsk4); -} - -TEST_F(homa_socktab, homa_sock_lock_slow) -{ - mock_cycles = ~0; - - homa_sock_lock(&self->hsk, "unit test"); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.socket_lock_misses); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.socket_lock_miss_cycles); - homa_sock_unlock(&self->hsk); - - mock_trylock_errors = 1; - homa_sock_lock(&self->hsk, "unit test"); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.socket_lock_misses); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.socket_lock_miss_cycles); - homa_sock_unlock(&self->hsk); -} \ No newline at end of file diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 5fe1154f..d578c32b 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -1,19 +1,9 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" +#include "homa_grant.h" +#include "homa_peer.h" +#include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" @@ -25,10 +15,11 @@ FIXTURE(homa_timer) { int client_port; struct in6_addr server_ip[1]; int server_port; - __u64 client_id; - __u64 server_id; - sockaddr_in_union server_addr; + u64 client_id; + u64 server_id; + union sockaddr_in_union server_addr; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; }; FIXTURE_SETUP(homa_timer) @@ -43,10 +34,15 @@ FIXTURE_SETUP(homa_timer) self->server_addr.in6.sin6_addr = *self->server_ip; self->server_addr.in6.sin6_port = htons(self->server_port); homa_init(&self->homa); + self->hnet = mock_hnet(0, &self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.resend_ticks = 2; self->homa.timer_ticks = 100; - mock_sock_init(&self->hsk, &self->homa, 0); +#ifndef __STRIP__ /* See strip.py */ + self->homa.unsched_bytes = 10000; + self->homa.grant->window = 10000; +#endif /* See strip.py */ + mock_sock_init(&self->hsk, self->hnet, 0); unit_log_clear(); } FIXTURE_TEARDOWN(homa_timer) @@ -55,345 +51,269 @@ FIXTURE_TEARDOWN(homa_timer) unit_teardown(); } -TEST_F(homa_timer, homa_check_timeout__request_ack) +TEST_F(homa_timer, homa_timer_check_rpc__request_ack) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 100); + ASSERT_NE(NULL, srpc); self->homa.request_ack_ticks = 2; /* First call: do nothing (response not fully transmitted). */ - homa_check_rpc(srpc); + homa_rpc_lock(srpc); +#ifndef __STRIP__ /* See strip.py */ + homa_xmit_data(srpc, false); +#else /* See strip.py */ + homa_xmit_data(srpc); +#endif /* See strip.py */ + skb_get(srpc->msgout.packets); + homa_timer_check_rpc(srpc); EXPECT_EQ(0, srpc->done_timer_ticks); + kfree_skb(srpc->msgout.packets); /* Second call: set done_timer_ticks. */ - homa_xmit_data(srpc, false); unit_log_clear(); - homa_check_rpc(srpc); + homa_timer_check_rpc(srpc); EXPECT_EQ(100, srpc->done_timer_ticks); EXPECT_STREQ("", unit_log_get()); /* Third call: haven't hit request_ack_ticks yet. */ unit_log_clear(); self->homa.timer_ticks++; - homa_check_rpc(srpc); + homa_timer_check_rpc(srpc); EXPECT_EQ(100, srpc->done_timer_ticks); - EXPECT_EQ(self->homa.timer_ticks, srpc->resend_timer_ticks); EXPECT_STREQ("", unit_log_get()); /* Fourth call: request ack. 
*/ unit_log_clear(); self->homa.timer_ticks++; - homa_check_rpc(srpc); + homa_timer_check_rpc(srpc); + homa_rpc_unlock(srpc); EXPECT_EQ(100, srpc->done_timer_ticks); - EXPECT_EQ(self->homa.timer_ticks, srpc->resend_timer_ticks); EXPECT_STREQ("xmit NEED_ACK", unit_log_get()); } -TEST_F(homa_timer, homa_check_timeout__client_rpc__granted_bytes_not_sent) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_timer, homa_timer_check_rpc__all_granted_bytes_received) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 200); + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 100, 5000); + ASSERT_NE(NULL, crpc); unit_log_clear(); + crpc->msgin.granted = 1400; crpc->silent_ticks = 10; - EXPECT_EQ(0, homa_check_rpc(crpc)); + homa_rpc_lock(crpc); + homa_timer_check_rpc(crpc); + homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_timer, homa_check_timeout__all_granted_bytes_received) +#endif /* See strip.py */ +TEST_F(homa_timer, homa_timer_check_rpc__no_buffer_space) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 5000); + ASSERT_NE(NULL, crpc); unit_log_clear(); - crpc->msgin.incoming = 1400; + crpc->msgin.num_bpages = 0; crpc->silent_ticks = 10; - EXPECT_EQ(0, homa_check_rpc(crpc)); + homa_rpc_lock(crpc); + homa_timer_check_rpc(crpc); + homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_timer, homa_check_timeout__client_rpc__all_granted_bytes_received_no_busy) +TEST_F(homa_timer, homa_timer_check_rpc__server_has_received_request) { - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 5000, 5000); + self->server_id, 100, 100); + ASSERT_NE(NULL, srpc); unit_log_clear(); - srpc->msgin.incoming = 1400; srpc->silent_ticks = 10; - EXPECT_EQ(0, homa_check_rpc(srpc)); + homa_rpc_lock(srpc); + homa_timer_check_rpc(srpc); + homa_rpc_unlock(srpc); EXPECT_EQ(0, srpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_timer, homa_check_timeout__resend_ticks_not_reached) +TEST_F(homa_timer, homa_timer_check_rpc__granted_bytes_not_sent) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 50000, 200); + self->server_port, self->client_id, 5000, 200); + ASSERT_NE(NULL, crpc); + crpc->msgout.next_xmit_offset = 0; unit_log_clear(); - self->homa.resend_ticks = 3; - crpc->msgout.granted = 0; - crpc->peer->outstanding_resends = self->homa.timeout_resends + 10; - - /* First call: resend_ticks-1 not reached. */ - crpc->silent_ticks = 1; - EXPECT_EQ(0, homa_check_rpc(crpc)); - EXPECT_EQ(1, crpc->silent_ticks); + crpc->silent_ticks = 10; + homa_rpc_lock(crpc); + homa_timer_check_rpc(crpc); + homa_rpc_unlock(crpc); + EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); - - /* Second call: resend_ticks-1 reached. 
*/ - crpc->silent_ticks = 2; - EXPECT_EQ(1, homa_check_rpc(crpc)); - EXPECT_EQ(2, crpc->silent_ticks); - EXPECT_EQ(0, crpc->peer->outstanding_resends); } -TEST_F(homa_timer, homa_check_timeout__peer_timeout) +TEST_F(homa_timer, homa_timer_check_rpc__timeout) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 200, 10000); + ASSERT_NE(NULL, crpc); unit_log_clear(); - crpc->silent_ticks = self->homa.resend_ticks; - crpc->peer->outstanding_resends = self->homa.timeout_resends; - EXPECT_EQ(1, homa_check_rpc(crpc)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_timeouts); - EXPECT_EQ(0, crpc->peer->outstanding_resends); -} -TEST_F(homa_timer, homa_check_timeout__server_rpc__state_not_incoming) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); - ASSERT_NE(NULL, srpc); - unit_log_clear(); - srpc->silent_ticks = self->homa.resend_ticks; - srpc->msgout.granted = 0; - EXPECT_EQ(0, homa_check_rpc(srpc)); - EXPECT_EQ(self->homa.resend_ticks, srpc->silent_ticks); -} -TEST_F(homa_timer, homa_check_timeout__rollover_state_for_least_recent_rpc) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); - ASSERT_NE(NULL, srpc); - unit_log_clear(); - srpc->msgout.granted = 0; - srpc->silent_ticks = self->homa.resend_ticks; - srpc->peer->least_recent_rpc = srpc; - srpc->peer->least_recent_ticks = 0; - srpc->peer->resend_rpc = NULL; - srpc->peer->current_ticks = self->homa.timer_ticks-1; - EXPECT_EQ(0, homa_check_rpc(srpc)); - EXPECT_EQ(srpc, srpc->peer->resend_rpc); - EXPECT_EQ(NULL, srpc->peer->least_recent_rpc); - EXPECT_EQ(self->homa.timer_ticks, srpc->peer->least_recent_ticks); - EXPECT_EQ(self->homa.timer_ticks, srpc->peer->current_ticks); -} -TEST_F(homa_timer, homa_check_timeout__compute_least_recent_rpc) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); - struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id+1, 100, 20000); - struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id+2, 100, 20000); - ASSERT_NE(NULL, srpc); - ASSERT_NE(NULL, srpc2); - ASSERT_NE(NULL, srpc3); - unit_log_clear(); - srpc->msgout.granted = 0; - srpc->silent_ticks = self->homa.resend_ticks; - srpc->resend_timer_ticks = self->homa.timer_ticks - 5; - srpc2->msgout.granted = 0; - srpc2->silent_ticks = self->homa.resend_ticks; - srpc2->resend_timer_ticks = self->homa.timer_ticks - 10; - srpc3->msgout.granted = 0; - srpc3->silent_ticks = self->homa.resend_ticks; - srpc3->resend_timer_ticks = self->homa.timer_ticks - 3; - srpc->peer->current_ticks = self->homa.timer_ticks-1; - EXPECT_EQ(0, homa_check_rpc(srpc)); - EXPECT_EQ(srpc, srpc->peer->least_recent_rpc); - EXPECT_EQ(0, homa_check_rpc(srpc2)); - EXPECT_EQ(srpc2, srpc->peer->least_recent_rpc); - EXPECT_EQ(0, homa_check_rpc(srpc3)); - EXPECT_EQ(srpc2, srpc->peer->least_recent_rpc); - EXPECT_EQ(self->homa.timer_ticks - 10, srpc->peer->least_recent_ticks); - EXPECT_EQ(self->homa.timer_ticks, srpc->peer->current_ticks); -} -TEST_F(homa_timer, homa_check_timeout__least_recent_rpc_with_ticks_overflow) -{ - 
struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); - struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id+1, 100, 20000); - struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id+2, 100, 20000); - ASSERT_NE(NULL, srpc); - ASSERT_NE(NULL, srpc2); - ASSERT_NE(NULL, srpc3); - unit_log_clear(); - srpc->msgout.granted = 0; - srpc->silent_ticks = self->homa.resend_ticks; - srpc->resend_timer_ticks = 5; - srpc2->msgout.granted = 0; - srpc2->silent_ticks = self->homa.resend_ticks; - srpc2->resend_timer_ticks = -10; - srpc3->msgout.granted = 0; - srpc3->silent_ticks = self->homa.resend_ticks; - srpc3->resend_timer_ticks = 3; - srpc->peer->current_ticks = self->homa.timer_ticks-1; - EXPECT_EQ(0, homa_check_rpc(srpc)); - EXPECT_EQ(srpc, srpc->peer->least_recent_rpc); - EXPECT_EQ(0, homa_check_rpc(srpc2)); - EXPECT_EQ(srpc2, srpc->peer->least_recent_rpc); - EXPECT_EQ(0, homa_check_rpc(srpc3)); - EXPECT_EQ(srpc2, srpc->peer->least_recent_rpc); - EXPECT_EQ(-10, srpc->peer->least_recent_ticks); - EXPECT_EQ(self->homa.timer_ticks, srpc->peer->current_ticks); + crpc->silent_ticks = self->homa.timeout_ticks-1; + homa_rpc_lock(crpc); + homa_timer_check_rpc(crpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(0, homa_metrics_per_cpu()->rpc_timeouts); +#endif /* See strip.py */ + EXPECT_EQ(0, crpc->error); + crpc->silent_ticks = self->homa.timeout_ticks; + homa_timer_check_rpc(crpc); + homa_rpc_unlock(crpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); +#endif /* See strip.py */ + EXPECT_EQ(ETIMEDOUT, -crpc->error); } -TEST_F(homa_timer, homa_check_timeout__too_soon_for_another_resend) +TEST_F(homa_timer, homa_timer_check_rpc__request_retransmission) { - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 5000, 5000); - ASSERT_NE(NULL, srpc); + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 200, 10000); - /* Send RESEND. */ + ASSERT_NE(NULL, crpc); + self->homa.resend_ticks = 3; + self->homa.resend_interval = 2; +#ifndef __STRIP__ /* See strip.py */ + crpc->msgin.granted = 5000; + crpc->msgout.granted = 0; +#endif /* See strip.py */ + + /* First call: resend_ticks-1. */ + crpc->silent_ticks = 2; unit_log_clear(); - srpc->silent_ticks = self->homa.resend_ticks; - srpc->peer->resend_rpc = srpc; - srpc->peer->most_recent_resend = self->homa.timer_ticks - - self->homa.resend_interval + 1; - EXPECT_EQ(0, homa_check_rpc(srpc)); + homa_timer_check_rpc(crpc); EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_timer, homa_check_timeout__send_resend) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 5000, 5000); - ASSERT_NE(NULL, srpc); + /* Second call: resend_ticks. 
*/ + crpc->silent_ticks = 3; unit_log_clear(); - srpc->silent_ticks = self->homa.resend_ticks-1; - srpc->resend_timer_ticks = self->homa.timer_ticks - 10; - srpc->peer->resend_rpc = srpc; + homa_rpc_lock(crpc); + homa_timer_check_rpc(crpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-9999", unit_log_get()); +#endif /* See strip.py */ - /* First call: no resend, but choose this RPC for least_recent_rpc. */ - EXPECT_EQ(0, homa_check_rpc(srpc)); + /* Third call: not yet time for next resend. */ + crpc->silent_ticks = 4; + unit_log_clear(); + homa_timer_check_rpc(crpc); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, srpc->peer->outstanding_resends); - EXPECT_EQ(srpc, srpc->peer->least_recent_rpc); - /* Second call: issue resend. */ - self->homa.timer_ticks++; - srpc->silent_ticks++; - EXPECT_EQ(0, homa_check_rpc(srpc)); + /* Fourth call: time for second resend. */ + crpc->silent_ticks = 5; + unit_log_clear(); + homa_timer_check_rpc(crpc); + homa_rpc_unlock(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); - EXPECT_EQ(self->homa.timer_ticks, srpc->resend_timer_ticks); - EXPECT_EQ(self->homa.timer_ticks, srpc->peer->most_recent_resend); - EXPECT_EQ(1, srpc->peer->outstanding_resends); - EXPECT_EQ(NULL, srpc->peer->least_recent_rpc); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-9999", unit_log_get()); +#endif /* See strip.py */ } TEST_F(homa_timer, homa_timer__basics) { - self->homa.timeout_resends = 2; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 200, 5000); + ASSERT_NE(NULL, crpc); + self->homa.timeout_ticks = 5; + self->homa.resend_ticks = 3; + self->homa.resend_interval = 2; unit_log_clear(); + crpc->silent_ticks = 1; homa_timer(&self->homa); - EXPECT_EQ(1, crpc->silent_ticks); + EXPECT_EQ(2, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); /* Send RESEND. */ unit_log_clear(); homa_timer(&self->homa); - EXPECT_EQ(2, crpc->silent_ticks); + EXPECT_EQ(3, crpc->silent_ticks); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-4999", unit_log_get()); +#endif /* See strip.py */ /* Don't send another RESEND (resend_interval not reached). */ unit_log_clear(); homa_timer(&self->homa); - EXPECT_EQ(3, crpc->silent_ticks); + EXPECT_EQ(4, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); /* Timeout the peer. 
*/ unit_log_clear(); +#ifndef __STRIP__ /* See strip.py */ crpc->peer->outstanding_resends = self->homa.timeout_resends; +#endif /* See strip.py */ homa_timer(&self->homa); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_timeouts); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); +#endif /* See strip.py */ EXPECT_EQ(ETIMEDOUT, -crpc->error); } TEST_F(homa_timer, homa_timer__reap_dead_rpcs) { struct homa_rpc *dead = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 20000); + self->server_port, self->client_id, 40000, 1000); + ASSERT_NE(NULL, dead); - homa_rpc_free(dead); + homa_rpc_end(dead); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(31, self->hsk.dead_skbs); +#else /* See strip.py */ EXPECT_EQ(30, self->hsk.dead_skbs); +#endif /* See strip.py */ // First call to homa_timer: not enough dead skbs. - self->homa.dead_buffs_limit = 31; + self->homa.dead_buffs_limit = 32; homa_timer(&self->homa); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(31, self->hsk.dead_skbs); +#else /* See strip.py */ EXPECT_EQ(30, self->hsk.dead_skbs); +#endif /* See strip.py */ // Second call to homa_timer: must reap. self->homa.dead_buffs_limit = 15; homa_timer(&self->homa); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(11, self->hsk.dead_skbs); +#else /* See strip.py */ EXPECT_EQ(10, self->hsk.dead_skbs); -} -TEST_F(homa_timer, homa_timer__rpc_ready) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 200); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - crpc->silent_ticks = 2; - homa_timer(&self->homa); - EXPECT_EQ(0, crpc->silent_ticks); - EXPECT_STREQ("", unit_log_get()); +#endif /* See strip.py */ } TEST_F(homa_timer, homa_timer__rpc_in_service) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 5000, 5000); + ASSERT_NE(NULL, srpc); unit_log_clear(); homa_timer(&self->homa); EXPECT_EQ(0, srpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_timer, homa_timer__abort_server_rpc) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 5000, 5000); - ASSERT_NE(NULL, srpc); - unit_log_clear(); - srpc->silent_ticks = self->homa.resend_ticks-1; - srpc->peer->outstanding_resends = self->homa.timeout_resends; - srpc->msgout.granted = 0; - homa_timer(&self->homa); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.server_rpc_discards); - EXPECT_EQ(1, unit_list_length(&self->hsk.dead_rpcs)); - EXPECT_STREQ("homa_remove_from_grantable invoked", unit_log_get()); -} diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index e86b0568..f0dd830a 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -1,19 +1,8 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" +#include "homa_peer.h" +#include "homa_sock.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" @@ -24,45 +13,15 @@ #define N(x) htonl(x) FIXTURE(homa_utils) { - struct in6_addr client_ip[1]; - int client_port; - struct in6_addr server_ip[1]; - int server_port; - __u64 client_id; - __u64 server_id; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; - sockaddr_in_union server_addr; - struct data_header data; - struct homa_rpc *crpc; - struct iovec iovec; - struct iov_iter iter; }; FIXTURE_SETUP(homa_utils) { - self->client_ip[0] = unit_get_in_addr("196.168.0.1"); - self->client_port = 40000; - self->server_ip[0] = unit_get_in_addr("1.2.3.4"); - self->server_port = 99; - self->client_id = 1234; - self->server_id = 1235; - self->server_addr.in6.sin6_family = AF_INET; - self->server_addr.in6.sin6_addr = *self->server_ip; - self->server_addr.in6.sin6_port = htons(self->server_port); homa_init(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); - self->data = (struct data_header){.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .type = DATA, - .sender_id = self->client_id}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .retransmit = 0, - .seg = {.offset = 0, .ack = {0, 0, 0}}}; - self->iovec.iov_base = (void *) 2000; - self->iovec.iov_len = 10000; - iov_iter_init(&self->iter, WRITE, &self->iovec, 1, self->iovec.iov_len); + self->hnet = mock_hnet(0, &self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); unit_log_clear(); } FIXTURE_TEARDOWN(homa_utils) @@ -71,6 +30,7 @@ FIXTURE_TEARDOWN(homa_utils) unit_teardown(); } +#ifndef __STRIP__ /* See strip.py */ /** * set_cutoffs() - A convenience method to allow all of the values in * homa->unsched_cutoffs to be set concisely. @@ -97,562 +57,126 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, homa->unsched_cutoffs[7] = c7; } -/** - * dead_rpcs() - Logs the ids for all of the RPCS in hsk->dead_rpcs. - * @hsk: Homa socket to check for dead RPCs. - * - * Return: the contents of the unit test log. 
- */ -static const char *dead_rpcs(struct homa_sock *hsk) +TEST_F(homa_utils, homa_init__pacer_alloc_failure) { - struct homa_rpc *rpc; - list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) - UNIT_LOG(" ", "%llu", rpc->id); - return unit_log_get(); -} + struct homa homa2; -TEST_F(homa_utils, homa_rpc_new_client__normal) -{ - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - ASSERT_FALSE(IS_ERR(crpc)); - homa_rpc_free(crpc); - homa_rpc_unlock(crpc); -} -TEST_F(homa_utils, homa_rpc_new_client__malloc_error) -{ mock_kmalloc_errors = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - EXPECT_TRUE(IS_ERR(crpc)); - EXPECT_EQ(ENOMEM, -PTR_ERR(crpc)); -} -TEST_F(homa_utils, homa_rpc_new_client__route_error) -{ - mock_route_errors = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - EXPECT_TRUE(IS_ERR(crpc)); - EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(crpc)); + unit_log_clear(); + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_EQ(NULL, homa2.pacer); + homa_destroy(&homa2); } -TEST_F(homa_utils, homa_rpc_new_client__socket_shutdown) +TEST_F(homa_utils, homa_init__grant_alloc_failure) { - self->hsk.shutdown = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - EXPECT_TRUE(IS_ERR(crpc)); - EXPECT_EQ(ESHUTDOWN, -PTR_ERR(crpc)); - self->hsk.shutdown = 1; -} + struct homa homa2; -TEST_F(homa_utils, homa_rpc_new_server__normal) -{ - int incoming_delta = 0; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - self->data.message_length = N(1600); - homa_data_pkt(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), srpc, NULL, &incoming_delta); - EXPECT_EQ(RPC_INCOMING, srpc->state); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - homa_rpc_free(srpc); -} -TEST_F(homa_utils, homa_rpc_new_server__already_exists) -{ - struct homa_rpc *srpc1 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - ASSERT_FALSE(IS_ERR(srpc1)); - homa_rpc_unlock(srpc1); - self->data.common.sender_id = cpu_to_be64( - be64_to_cpu(self->data.common.sender_id) - + 2*HOMA_SERVER_RPC_BUCKETS); - struct homa_rpc *srpc2 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - ASSERT_FALSE(IS_ERR(srpc2)); - homa_rpc_unlock(srpc2); - EXPECT_NE(srpc2, srpc1); - self->data.common.sender_id = cpu_to_be64( - be64_to_cpu(self->data.common.sender_id) - - 2*HOMA_SERVER_RPC_BUCKETS); - struct homa_rpc *srpc3 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - ASSERT_FALSE(IS_ERR(srpc3)); - homa_rpc_unlock(srpc3); - EXPECT_EQ(srpc3, srpc1); + mock_kmalloc_errors = 2; + unit_log_clear(); + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_EQ(NULL, homa2.grant); + homa_destroy(&homa2); } -TEST_F(homa_utils, homa_rpc_new_server__malloc_error) +#endif /* See strip.py */ +TEST_F(homa_utils, homa_init__peertab_alloc_failure) { + struct homa homa2; + +#ifndef __STRIP__ /* See strip.py */ + mock_kmalloc_errors = 4; +#else /* See strip.py */ mock_kmalloc_errors = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - EXPECT_TRUE(IS_ERR(srpc)); - EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); -} -TEST_F(homa_utils, homa_rpc_new_server__addr_error) -{ - mock_route_errors = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - EXPECT_TRUE(IS_ERR(srpc)); - EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(srpc)); -} 
-TEST_F(homa_utils, homa_rpc_new_server__socket_shutdown) -{ - self->hsk.shutdown = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - EXPECT_TRUE(IS_ERR(srpc)); - EXPECT_EQ(ESHUTDOWN, -PTR_ERR(srpc)); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); -} -TEST_F(homa_utils, homa_rpc_new_server__handoff_rpc) -{ - self->data.message_length = N(1400); - self->data.seg.segment_length = N(1400); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - EXPECT_EQ(RPC_INCOMING, srpc->state); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_requests)); - homa_rpc_free(srpc); +#endif/* See strip.py */ + unit_log_clear(); + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_EQ(NULL, homa2.peertab); + homa_destroy(&homa2); } -TEST_F(homa_utils, homa_rpc_new_server__dont_handoff_rpc) +TEST_F(homa_utils, homa_init__cant_allocate_port_map) { - self->data.message_length = N(2800); - self->data.seg.offset = N(1400); - self->data.seg.segment_length = N(1400); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - EXPECT_EQ(RPC_INCOMING, srpc->state); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests)); - homa_rpc_free(srpc); -} + struct homa homa2; -TEST_F(homa_utils, homa_rpc_lock_slow) +#ifndef __STRIP__ /* See strip.py */ + mock_kmalloc_errors = 0x20; +#else /* See strip.py */ + mock_kmalloc_errors = 4; +#endif/* See strip.py */ + unit_log_clear(); + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_EQ(NULL, homa2.socktab); + homa_destroy(&homa2); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_utils, homa_init__homa_skb_init_failure) { - mock_cycles = ~0; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - ASSERT_FALSE(IS_ERR(crpc)); - homa_rpc_free(crpc); - homa_rpc_unlock(crpc); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); + struct homa homa2; - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.client_lock_misses); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.client_lock_miss_cycles); - homa_rpc_lock_slow(crpc); - homa_rpc_unlock(crpc); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.client_lock_misses); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.client_lock_miss_cycles); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.server_lock_misses); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.server_lock_miss_cycles); - homa_rpc_lock_slow(srpc); - homa_rpc_unlock(srpc); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.server_lock_misses); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.server_lock_miss_cycles); + mock_kmalloc_errors = 0x40; + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_SUBSTR("Couldn't initialize skb management (errno 12)", + mock_printk_output); + homa_destroy(&homa2); } +#endif /* See strip.py */ -TEST_F(homa_utils, homa_rpc_acked__basics) -{ - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - 
.client_id = cpu_to_be64(self->client_id)}; - homa_rpc_acked(&hsk, self->client_ip, &ack); - EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); - EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); -} -TEST_F(homa_utils, homa_rpc_acked__lookup_socket) -{ - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id)}; - homa_rpc_acked(&self->hsk, self->client_ip, &ack); - EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); - EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); -} -TEST_F(homa_utils, homa_rpc_acked__no_such_socket) +TEST_F(homa_utils, homa_destroy) { - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port+1), - .client_id = cpu_to_be64(self->client_id)}; - homa_rpc_acked(&hsk, self->client_ip, &ack); - EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); - EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); -} -TEST_F(homa_utils, homa_rpc_acked__no_such_rpc) -{ - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id+10)}; - homa_rpc_acked(&hsk, self->client_ip, &ack); - EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); - EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); -} + struct homa homa2; -TEST_F(homa_utils, homa_rpc_free__basics) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 20000); - EXPECT_EQ(1, unit_list_length(&self->homa.grantable_peers)); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - mock_log_rcu_sched = 1; - homa_rpc_free(crpc); - EXPECT_STREQ("homa_remove_from_grantable invoked", - unit_log_get()); - EXPECT_EQ(0, unit_list_length(&self->homa.grantable_peers)); - EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, crpc->id)); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, unit_list_length(&self->hsk.dead_rpcs)); -} -TEST_F(homa_utils, homa_rpc_free__already_dead) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 100); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_STREQ("homa_remove_from_grantable invoked", - unit_log_get()); - unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_free__state_ready) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 100); - 
ASSERT_NE(NULL, crpc); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); - homa_rpc_free(crpc); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_responses)); -} -TEST_F(homa_utils, homa_rpc_free__dead_buffs) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 10000, 1000); - ASSERT_NE(NULL, crpc1); - homa_rpc_free(crpc1); - EXPECT_EQ(9, self->homa.max_dead_buffs); - EXPECT_EQ(9, self->hsk.dead_skbs); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 5000, 1000); - ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc2); - EXPECT_EQ(14, self->homa.max_dead_buffs); - EXPECT_EQ(14, self->hsk.dead_skbs); -} -TEST_F(homa_utils, homa_rpc_free__wakeup_interest) -{ - struct homa_interest interest = {}; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 100); - ASSERT_NE(NULL, crpc); - atomic_long_set(&interest.ready_rpc, 0); - interest.reg_rpc = crpc; - crpc->interest = &interest; - unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_EQ(NULL, interest.reg_rpc); - EXPECT_STREQ("homa_remove_from_grantable invoked; " - "wake_up_process pid -1", unit_log_get()); + homa_init(&homa2); + homa_destroy(&homa2); } -TEST_F(homa_utils, homa_rpc_free__update_total_incoming) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 20000); - EXPECT_NE(NULL, crpc); - unit_log_clear(); - atomic_set(&self->homa.total_incoming, 10000); - homa_rpc_free(crpc); - EXPECT_EQ(1400, atomic_read(&self->homa.total_incoming)); -} -TEST_F(homa_utils, homa_rpc_free__release_buffers) -{ - struct homa_pool *pool = &self->hsk.buffer_pool; - EXPECT_EQ(0, -homa_pool_init(pool, &self->homa, (void *) 0x1000000, - 100*HOMA_BPAGE_SIZE)); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - 4000, 98, 1000, 150000); - ASSERT_NE(NULL, crpc); - - EXPECT_EQ(0, homa_pool_allocate(crpc)); - EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); - homa_rpc_free(crpc); - EXPECT_EQ(0, atomic_read(&pool->descriptors[1].refs)); -} -TEST_F(homa_utils, homa_rpc_free__remove_from_throttled_list) +TEST_F(homa_utils, homa_net_destroy__delete_sockets) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 10000, 1000); - homa_add_to_throttled(crpc); - EXPECT_EQ(1, unit_list_length(&self->homa.throttled_rpcs)); - unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_EQ(0, unit_list_length(&self->homa.throttled_rpcs)); -} + struct homa_sock hsk1, hsk2, hsk3; + struct homa_net *hnet; -TEST_F(homa_utils, homa_rpc_free_rcu) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 20000); - homa_rpc_free(crpc); - EXPECT_EQ(RPC_DEAD, crpc->state); -} + hnet = mock_hnet(1, &self->homa); + mock_sock_init(&hsk1, hnet, 100); + mock_sock_init(&hsk2, hnet, 101); + mock_sock_init(&hsk3, self->hnet, 100); -TEST_F(homa_utils, homa_rpc_reap__basics) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 2000); - struct homa_rpc *crpc2 = 
unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 5000, 100); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, 2000, 100); - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - ASSERT_NE(NULL, crpc3); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - homa_rpc_free(crpc3); - unit_log_clear(); - EXPECT_STREQ("1234 1236 1238", dead_rpcs(&self->hsk)); - EXPECT_EQ(11, self->hsk.dead_skbs); - unit_log_clear(); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 7)); - EXPECT_STREQ("reaped 1234", unit_log_get()); - unit_log_clear(); - EXPECT_STREQ("1236 1238", dead_rpcs(&self->hsk)); - EXPECT_EQ(4, self->hsk.dead_skbs); -} -TEST_F(homa_utils, homa_rpc_reap__protected) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 2000); - ASSERT_NE(NULL, crpc1); - homa_rpc_free(crpc1); - unit_log_clear(); - homa_protect_rpcs(&self->hsk); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 10)); - homa_unprotect_rpcs(&self->hsk); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_reap__skip_rpc_because_of_flags) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 1000, 2000); - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - unit_log_clear(); - atomic_or(RPC_COPYING_TO_USER, &crpc1->flags); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("reaped 1236", unit_log_get()); - unit_log_clear(); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("", unit_log_get()); - atomic_andnot(RPC_COPYING_TO_USER, &crpc1->flags); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("reaped 1234", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_reap__skip_rpc_because_of_active_xmits) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 1000, 2000); - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - unit_log_clear(); - atomic_inc(&crpc1->msgout.active_xmits); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 100)); - EXPECT_STREQ("reaped 1236", unit_log_get()); - unit_log_clear(); - atomic_dec(&crpc1->msgout.active_xmits); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 100)); - EXPECT_STREQ("reaped 1234", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_reap__grant_in_progress) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 1000, 2000); - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - unit_log_clear(); - atomic_inc(&crpc1->grants_in_progress); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("reaped 1236", 
unit_log_get()); - unit_log_clear(); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("", unit_log_get()); - atomic_dec(&crpc1->grants_in_progress); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("reaped 1234", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_reap__hit_limit_in_msgin_packets) -{ - struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 10000, 100); - ASSERT_NE(NULL, srpc1); - homa_rpc_free(srpc1); - EXPECT_EQ(8, self->hsk.dead_skbs); - unit_log_clear(); - homa_rpc_reap(&self->hsk, 5); - EXPECT_STREQ("1235", dead_rpcs(&self->hsk)); - EXPECT_EQ(3, self->hsk.dead_skbs); + homa_net_destroy(hnet); + EXPECT_EQ(1, hsk1.shutdown); + EXPECT_EQ(1, hsk2.shutdown); + EXPECT_EQ(0, hsk3.shutdown); + + unit_sock_destroy(&hsk3); } -TEST_F(homa_utils, homa_rpc_reap__nothing_to_reap) +TEST_F(homa_utils, homa_net_destroy__delete_peers) { - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 10)); -} + struct homa_peer *peer; + struct homa_net *hnet; + struct homa_sock hsk2; + struct in6_addr addr; -TEST_F(homa_utils, homa_find_client_rpc) -{ - atomic64_set(&self->homa.next_outgoing_id, 3); - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 10000, 1000); - atomic64_set(&self->homa.next_outgoing_id, 3 + 3*HOMA_CLIENT_RPC_BUCKETS); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 10000, 1000); - atomic64_set(&self->homa.next_outgoing_id, - 3 + 10*HOMA_CLIENT_RPC_BUCKETS); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, 10000, 1000); - atomic64_set(&self->homa.next_outgoing_id, 40); - struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+6, 10000, 1000); + hnet = mock_hnet(1, &self->homa); + mock_sock_init(&hsk2, hnet, 44); - EXPECT_EQ(crpc1, homa_find_client_rpc(&self->hsk, crpc1->id)); - homa_rpc_unlock(crpc1); - EXPECT_EQ(crpc2, homa_find_client_rpc(&self->hsk, crpc2->id)); - homa_rpc_unlock(crpc2); - EXPECT_EQ(crpc3, homa_find_client_rpc(&self->hsk, crpc3->id)); - homa_rpc_unlock(crpc3); - EXPECT_EQ(crpc4, homa_find_client_rpc(&self->hsk, crpc4->id)); - homa_rpc_unlock(crpc4); - EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, 15)); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - homa_rpc_free(crpc3); - homa_rpc_free(crpc4); -} + addr = unit_get_in_addr("1.2.3.4"); + peer = homa_peer_get(&hsk2, &addr); + homa_peer_release(peer); + peer = homa_peer_get(&self->hsk, &addr); + homa_peer_release(peer); + addr = unit_get_in_addr("1.2.3.5"); + peer = homa_peer_get(&hsk2, &addr); + homa_peer_release(peer); + EXPECT_EQ(3, unit_count_peers(&self->homa)); -TEST_F(homa_utils, homa_find_server_rpc) -{ - struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 10000, 100); - ASSERT_NE(NULL, srpc1); - struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id + 30*HOMA_SERVER_RPC_BUCKETS, - 10000, 100); - ASSERT_NE(NULL, srpc2); - struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port+1, - 
self->server_id + 10*HOMA_SERVER_RPC_BUCKETS, - 10000, 100); - ASSERT_NE(NULL, srpc3); - struct homa_rpc *srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port+1, - self->server_id + 4, 10000, 100); - ASSERT_NE(NULL, srpc4); - EXPECT_EQ(srpc1, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, srpc1->id)); - homa_rpc_unlock(srpc1); - EXPECT_EQ(srpc2, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, srpc2->id)); - homa_rpc_unlock(srpc2); - EXPECT_EQ(srpc3, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port+1, srpc3->id)); - homa_rpc_unlock(srpc3); - EXPECT_EQ(srpc4, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port+1, srpc4->id)); - homa_rpc_unlock(srpc4); - EXPECT_EQ(NULL, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, 3)); + homa_net_destroy(hnet); + EXPECT_EQ(1, unit_count_peers(&self->homa)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_print_ipv4_addr) { - char *p1, *p2; - int i; - struct in6_addr test_addr1 = unit_get_in_addr("192.168.0.1"); struct in6_addr test_addr2 = unit_get_in_addr("1.2.3.4"); struct in6_addr test_addr3 = unit_get_in_addr("5.6.7.8"); + char *p1, *p2; + int i; + p1 = homa_print_ipv6_addr(&test_addr1); p2 = homa_print_ipv6_addr(&test_addr2); EXPECT_STREQ("192.168.0.1", p1); @@ -668,44 +192,27 @@ TEST_F(homa_utils, homa_snprintf) { char buffer[50]; int used = 0; - used = homa_snprintf(buffer, sizeof32(buffer), used, + + used = homa_snprintf(buffer, sizeof(buffer), used, "Test message with values: %d and %d", 100, 1000); EXPECT_EQ(38, used); EXPECT_STREQ("Test message with values: 100 and 1000", buffer); - used = homa_snprintf(buffer, sizeof32(buffer), used, + used = homa_snprintf(buffer, sizeof(buffer), used, "; plus: %d", 123456); EXPECT_EQ(49, used); EXPECT_STREQ("Test message with values: 100 and 1000; plus: 123", buffer); - used = homa_snprintf(buffer, sizeof32(buffer), used, + used = homa_snprintf(buffer, sizeof(buffer), used, "more text, none of which fits"); EXPECT_EQ(49, used); EXPECT_STREQ("Test message with values: 100 and 1000; plus: 123", buffer); } +#endif /* See strip.py */ -TEST_F(homa_utils, homa_append_metric) -{ - self->homa.metrics_length = 0; - homa_append_metric(&self->homa, "x: %d, y: %d", 10, 20); - EXPECT_EQ(12, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20", self->homa.metrics); - - homa_append_metric(&self->homa, ", z: %d", 12345); - EXPECT_EQ(22, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20, z: 12345", self->homa.metrics); - EXPECT_EQ(30, self->homa.metrics_capacity); - - homa_append_metric(&self->homa, ", q: %050d", 88); - EXPECT_EQ(77, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20, z: 12345, " - "q: 00000000000000000000000000000000000000000000000088", - self->homa.metrics); - EXPECT_EQ(120, self->homa.metrics_capacity); -} - +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_prios_changed__basics) { set_cutoffs(&self->homa, 90, 80, HOMA_MAX_MESSAGE_LENGTH*2, 60, 50, @@ -737,3 +244,4 @@ TEST_F(homa_utils, homa_prios_changed__share_lowest_priority) EXPECT_EQ(0x7fffffff, self->homa.unsched_cutoffs[0]); EXPECT_EQ(0, self->homa.max_sched_prio); } +#endif /* See strip.py */ diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c index cd05946b..bbc250bb 100644 --- a/test/unit_timetrace.c +++ b/test/unit_timetrace.c @@ -1,17 +1,4 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, 
and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #define KSELFTEST_NOT_MAIN 1 @@ -28,8 +15,8 @@ FIXTURE_SETUP(timetrace) self->file.private_data = 0; tt_buffer_size = 64; tt_test_no_khz = true; - tt_init("tt", NULL); - mock_cycles = 1000; + tt_init("tt"); + mock_tt_cycles = 1000; } FIXTURE_TEARDOWN(timetrace) { @@ -47,24 +34,25 @@ TEST_F(timetrace, tt_freeze) EXPECT_EQ(0, tt_freeze_count.counter); tt_freeze(); EXPECT_EQ(1, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); tt_freeze(); EXPECT_EQ(1, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); } TEST_F(timetrace, tt_record__basics) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_record("Message with no args"); - mock_cycles++; + mock_tt_cycles++; tt_record1("Message with 1 arg: %d", 99); - mock_cycles++; + mock_tt_cycles++; tt_record2("Message with 2 args: %d %d %d %d", 100, 200); - mock_cycles++; + mock_tt_cycles++; tt_record3("Message with 3 args: %d %d %d %d", 10, 20, 30); - mock_cycles++; + mock_tt_cycles++; tt_record4("Message with 4 args: %d %d %d %d", 1, 2, 3, 4); tt_proc_open(NULL, &self->file); tt_proc_read(&self->file, buffer, sizeof(buffer), 0); @@ -80,16 +68,17 @@ TEST_F(timetrace, tt_record__basics) TEST_F(timetrace, tt_record_buf__wraparound) { char buffer[100]; + memset(buffer, 0, sizeof(buffer)); tt_buffer_size = 4; tt_record("Message 1"); - mock_cycles++; + mock_tt_cycles++; tt_record("Message 2"); - mock_cycles++; + mock_tt_cycles++; tt_record("Message 3"); - mock_cycles++; + mock_tt_cycles++; tt_record("Message 4"); - mock_cycles++; + mock_tt_cycles++; tt_record("Message 5"); tt_proc_open(NULL, &self->file); tt_proc_read(&self->file, buffer, sizeof(buffer), 0); @@ -101,9 +90,9 @@ TEST_F(timetrace, tt_record_buf__wraparound) TEST_F(timetrace, tt_find_oldest) { - int pos[NR_CPUS]; - tt_buffer_size = 4; + int pos[nr_cpu_ids]; + tt_buffer_size = 4; tt_record_buf(tt_buffers[0], 1500, "Buf0", 0, 0, 0, 0); tt_record_buf(tt_buffers[0], 1600, "Buf0", 0, 0, 0, 0); tt_record_buf(tt_buffers[0], 1700, "Buf0", 0, 0, 0, 0); @@ -130,14 +119,18 @@ TEST_F(timetrace, tt_find_oldest) TEST_F(timetrace, tt_proc_open__not_initialized) { + int err; + tt_destroy(); - int err = -tt_proc_open(NULL, &self->file); + err = -tt_proc_open(NULL, &self->file); EXPECT_EQ(EINVAL, err); } TEST_F(timetrace, tt_proc_open__no_memory) { + int err; + mock_kmalloc_errors = 1; - int err = -tt_proc_open(NULL, &self->file); + err = -tt_proc_open(NULL, &self->file); EXPECT_EQ(ENOMEM, err); } TEST_F(timetrace, tt_proc_open__increment_frozen) @@ -149,8 +142,10 @@ TEST_F(timetrace, tt_proc_open__increment_frozen) TEST_F(timetrace, tt_proc_read__bogus_file) { struct tt_proc_file pf; + int err; + pf.file = NULL; - int err = -tt_proc_read(&self->file, 
(char *) 1000, 100, 0); + err = -tt_proc_read(&self->file, (char *) 1000, 100, 0); EXPECT_EQ(EINVAL, err); self->file.private_data = &pf; err = -tt_proc_read(&self->file, (char *) 1000, 100, 0); @@ -159,14 +154,17 @@ TEST_F(timetrace, tt_proc_read__bogus_file) } TEST_F(timetrace, tt_proc_read__uninitialized) { + int result; + tt_proc_open(NULL, &self->file); tt_destroy(); - int result = tt_proc_read(&self->file, (char *) 1000, 100, 0); + result = tt_proc_read(&self->file, (char *) 1000, 100, 0); EXPECT_EQ(0, result); } TEST_F(timetrace, tt_proc_read__nothing_to_read) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); buffer[0] = 0; tt_proc_open(NULL, &self->file); @@ -177,6 +175,7 @@ TEST_F(timetrace, tt_proc_read__nothing_to_read) TEST_F(timetrace, tt_proc_read__leftovers) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_pf_storage = 100; tt_record_buf(tt_buffers[0], 1000, @@ -201,6 +200,7 @@ TEST_F(timetrace, tt_proc_read__leftovers) TEST_F(timetrace, tt_proc_read__sort_events_by_time) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_record_buf(tt_buffers[0], 1000, "Buf0", 0, 0, 0, 0); tt_record_buf(tt_buffers[0], 1100, "Buf0", 0, 0, 0, 0); @@ -228,6 +228,7 @@ TEST_F(timetrace, tt_proc_read__sort_events_by_time) TEST_F(timetrace, tt_proc_read__event_barely_fits_in_buffer) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_pf_storage = 25; tt_record_buf(tt_buffers[0], 1000, @@ -246,6 +247,7 @@ TEST_F(timetrace, tt_proc_read__event_barely_fits_in_buffer) TEST_F(timetrace, tt_proc_read__single_entry_too_large) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_pf_storage = 20; tt_record_buf(tt_buffers[0], 1000, @@ -259,8 +261,10 @@ TEST_F(timetrace, tt_proc_read__single_entry_too_large) TEST_F(timetrace, tt_proc_release__bogus_file) { struct tt_proc_file pf; + int err; + pf.file = NULL; - int err = -tt_proc_release(NULL, &self->file); + err = -tt_proc_release(NULL, &self->file); EXPECT_EQ(EINVAL, err); self->file.private_data = &pf; err = -tt_proc_release(NULL, &self->file); @@ -281,20 +285,20 @@ TEST_F(timetrace, tt_proc_release__unfreeze) tt_freeze(); tt_proc_open(NULL, &self->file); EXPECT_EQ(2, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); tt_proc_open(NULL, &file2); EXPECT_EQ(3, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); tt_proc_release(NULL, &self->file); EXPECT_EQ(2, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); EXPECT_NE(NULL, tt_buffers[1]->events[3].format); - EXPECT_EQ(2, tt_buffers[1]->next_index); + EXPECT_EQ(6, atomic_read(&tt_buffers[1]->next_index)); tt_proc_release(NULL, &file2); EXPECT_EQ(0, tt_freeze_count.counter); - EXPECT_FALSE(tt_frozen); + EXPECT_FALSE(atomic_read(&tt_frozen)); EXPECT_EQ(NULL, tt_buffers[1]->events[3].format); - EXPECT_EQ(0, tt_buffers[1]->next_index); -} \ No newline at end of file + EXPECT_EQ(0, atomic_read(&tt_buffers[1]->next_index)); +} diff --git a/test/utils.c b/test/utils.c index f7368771..7193ad03 100644 --- a/test/utils.c +++ b/test/utils.c @@ -1,29 +1,23 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file various utility functions for unit testing; this file * is implemented entirely in C, and accesses Homa and kernel internals. */ #include "homa_impl.h" +#include "homa_grant.h" +#include "homa_peer.h" +#include "homa_rpc.h" #include "ccutils.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "mock.h" #include "utils.h" +#ifndef __STRIP__ /* See strip.py */ +#include "homa_pacer.h" +#endif /* See strip.py */ + /** * unit_client_rpc() - Create a homa_client_rpc and arrange for it to be * in a given state. @@ -44,21 +38,21 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, struct in6_addr *server_ip, int server_port, int id, int req_length, int resp_length) { - int bytes_received; - sockaddr_in_union server_addr; int saved_id = atomic64_read(&hsk->homa->next_outgoing_id); - int incoming_delta = 0; + union sockaddr_in_union server_addr; + int bytes_received, this_size; + struct homa_rpc *crpc; server_addr.in6.sin6_family = AF_INET6; server_addr.in6.sin6_addr = *server_ip; server_addr.in6.sin6_port = htons(server_port); if (id != 0) atomic64_set(&hsk->homa->next_outgoing_id, id); - struct homa_rpc *crpc = homa_rpc_new_client(hsk, &server_addr); + crpc = homa_rpc_alloc_client(hsk, &server_addr); if (IS_ERR(crpc)) return NULL; - if (homa_message_out_init(crpc, unit_iov_iter(NULL, req_length), 0)) { - homa_rpc_free(crpc); + if (homa_message_out_fill(crpc, unit_iov_iter(NULL, req_length), 0)) { + homa_rpc_end(crpc); return NULL; } homa_rpc_unlock(crpc); @@ -69,28 +63,22 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, return crpc; crpc->msgout.next_xmit_offset = crpc->msgout.length; - struct data_header h = { - .common = { - .sport = htons(server_port), - .dport = htons(hsk->port), - .type = DATA, - .sender_id = cpu_to_be64(id ^ 1) - }, - .message_length = htonl(resp_length), - .incoming = htonl(10000), - .cutoff_version = 0, - .retransmit = 0, - .seg = {.offset = 0, - .segment_length = htonl(UNIT_TEST_DATA_PER_PACKET), - .ack = {0, 0, 0}} + struct homa_data_hdr h; + memset(&h, 0, sizeof(h)); + h.common = (struct homa_common_hdr){ + .sport = htons(server_port), + .dport = htons(hsk->port), + .type = DATA, + .sender_id = cpu_to_be64(id ^ 1) }; + h.message_length = htonl(resp_length); +#ifndef __STRIP__ /* See strip.py */ + h.incoming = htonl(10000); +#endif /* See strip.py */ - int this_size = (resp_length > UNIT_TEST_DATA_PER_PACKET) + this_size = (resp_length > UNIT_TEST_DATA_PER_PACKET) ? 
UNIT_TEST_DATA_PER_PACKET : resp_length; - h.seg.segment_length = htonl(this_size); - homa_data_pkt(mock_skb_new(server_ip, &h.common, this_size, 0), - crpc, NULL, &incoming_delta); - atomic_add(incoming_delta, &hsk->homa->total_incoming); + homa_dispatch_pkts(mock_skb_alloc(server_ip, &h.common, this_size, 0)); if (state == UNIT_RCVD_ONE_PKT) return crpc; for (bytes_received = UNIT_TEST_DATA_PER_PACKET; @@ -100,16 +88,13 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, if (this_size > UNIT_TEST_DATA_PER_PACKET) this_size = UNIT_TEST_DATA_PER_PACKET; h.seg.offset = htonl(bytes_received); - h.seg.segment_length = htonl(this_size); - int incoming_delta = 0; - homa_data_pkt(mock_skb_new(server_ip, &h.common, - this_size , 0), crpc, NULL, &incoming_delta); - atomic_add(incoming_delta, &hsk->homa->total_incoming); + homa_dispatch_pkts(mock_skb_alloc(server_ip, &h.common, + this_size, 0)); } if (state == UNIT_RCVD_MSG) return crpc; - FAIL("unit_client_rpc received unexpected state %d", state); - homa_rpc_free(crpc); + FAIL("%s received unexpected state %d", __func__, state); + homa_rpc_end(crpc); return NULL; } @@ -126,15 +111,15 @@ struct in6_addr unit_get_in_addr(char *s) { struct in6_addr ret = {}; unsigned int a, b, c, d; + if (sscanf(s, "%u.%u.%u.%u", &a, &b, &c, &d) == 4) { ret.s6_addr32[3] = htonl((a<<24) + (b<<16) + (c<<8) + d); ret.s6_addr32[2] = htonl(0x0000ffff); } else { - int inet_pton(int af, const char *src, void *dst); int res = inet_pton(AF_INET6, s, &ret); - if (res <= 0) { + + if (res <= 0) abort(); - } } return ret; } @@ -148,6 +133,7 @@ int unit_list_length(struct list_head *head) { struct list_head *pos; int count = 0; + list_for_each(pos, head) { count++; } @@ -162,6 +148,7 @@ int unit_list_length(struct list_head *head) void unit_log_active_ids(struct homa_sock *hsk) { struct homa_rpc *rpc; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) unit_log_printf(" ", "%llu", rpc->id); } @@ -173,8 +160,9 @@ void unit_log_active_ids(struct homa_sock *hsk) */ void unit_log_hashed_rpcs(struct homa_sock *hsk) { - int i; struct homa_rpc *rpc; + int i; + for (i = 0; i < HOMA_CLIENT_RPC_BUCKETS; i++) { hlist_for_each_entry_rcu(rpc, &hsk->client_rpc_buckets[i].rpcs, hash_links) { @@ -203,44 +191,48 @@ void unit_log_frag_list(struct sk_buff *skb, int verbose) for (frag = skb_shinfo(skb)->frag_list; frag != NULL; frag = frag->next) { - if (verbose) { + if (verbose) homa_print_packet(frag, buffer, sizeof(buffer)); - } else { + else homa_print_packet_short(frag, buffer, sizeof(buffer)); - } unit_log_printf("; ", "%s", buffer); } } +#ifndef __STRIP__ /* See strip.py */ /** * unit_log_grantables() - Append to the test log information about all of - * the messages in homa->grantable_msgs. + * the messages under grant->grantable_peers. * @homa: Homa's overall state. 
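 *
 * Example of the resulting log (editor's illustration; ids and byte
 * counts are arbitrary, format taken from the unit_log_printf calls
 * below):
 *
 *	active[0]: id 1234 ungranted 600; peer 1.2.3.4:
 *	id 1236 ungranted 2000 id 1238 ungranted 5000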
*/ void unit_log_grantables(struct homa *homa) { struct homa_peer *peer; struct homa_rpc *rpc; - int count = 0; - list_for_each_entry(peer, &homa->grantable_peers, grantable_links) { - count++; + int i; + + for (i = 0; i < homa->grant->num_active_rpcs; i++) { + rpc = homa->grant->active_rpcs[i]; + unit_log_printf("; ", "active[%d]: id %llu ungranted %d", + i, rpc->id, + rpc->msgin.length - rpc->msgin.granted); + if (rpc->msgin.rank != i) { + unit_log_printf(" ", "bad rank %d", rpc->msgin.rank); + } + } + list_for_each_entry(peer, &homa->grant->grantable_peers, + grantable_links) { + unit_log_printf("; ", "peer %s:", + homa_print_ipv6_addr(&peer->addr)); list_for_each_entry(rpc, &peer->grantable_rpcs, grantable_links) { - unit_log_printf("; ", "%s from %s, id %lu, " - "remaining %d", - homa_is_client(rpc->id) ? "response" - : "request", - homa_print_ipv6_addr(&peer->addr), - (long unsigned int) rpc->id, - rpc->msgin.bytes_remaining); + unit_log_printf(" ", "id %llu ungranted %d", + rpc->id, + rpc->msgin.length - rpc->msgin.granted); } } - if (count != homa->num_grantable_peers) { - unit_log_printf("; ", "num_grantable_peers error: should " - "be %d, is %d", - count, homa->num_grantable_peers); - } } +#endif /* See strip.py */ /** * unit_log_message_out_packets() - Append to the test log a human-readable @@ -260,11 +252,10 @@ void unit_log_message_out_packets(struct homa_message_out *message, int verbose) for (skb = message->packets; skb != NULL; skb = homa_get_skb_info(skb)->next_skb) { - if (verbose) { + if (verbose) homa_print_packet(skb, buffer, sizeof(buffer)); - } else { + else homa_print_packet_short(skb, buffer, sizeof(buffer)); - } unit_log_printf("; ", "%s", buffer); } } @@ -274,7 +265,7 @@ void unit_log_message_out_packets(struct homa_message_out *message, int verbose) * unit_log_filled_skbs() - Append to the test log a human-readable description * of a list of packet buffers created by homa_fill_packets. * @skb: First in list of sk_buffs to print; the list is linked - * using homa_next_skb. + * using homa_skb_info->next_skb. * @verbose: If non-zero, use homa_print_packet for each packet; * otherwise use homa_print_packet_short. */ @@ -283,11 +274,10 @@ void unit_log_filled_skbs(struct sk_buff *skb, int verbose) char buffer[400]; while (skb != NULL) { - if (verbose) { + if (verbose) homa_print_packet(skb, buffer, sizeof(buffer)); - } else { + else homa_print_packet_short(skb, buffer, sizeof(buffer)); - } unit_log_printf("; ", "%s", buffer); skb = homa_get_skb_info(skb)->next_skb; } @@ -306,15 +296,15 @@ void unit_log_skb_list(struct sk_buff_head *packets, int verbose) char buffer[200]; skb_queue_walk(packets, skb) { - if (verbose) { + if (verbose) homa_print_packet(skb, buffer, sizeof(buffer)); - } else { + else homa_print_packet_short(skb, buffer, sizeof(buffer)); - } unit_log_printf("; ", "%s", buffer); } } +#ifndef __STRIP__ /* See strip.py */ /** * unit_log_throttled() - Append to the test log information about all of * the messages in homa->throttle_rpcs. @@ -323,14 +313,54 @@ void unit_log_skb_list(struct sk_buff_head *packets, int verbose) void unit_log_throttled(struct homa *homa) { struct homa_rpc *rpc; - list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { - unit_log_printf("; ", "%s id %lu, next_offset %d", + + list_for_each_entry_rcu(rpc, &homa->pacer->throttled_rpcs, throttled_links) { + unit_log_printf("; ", "%s id %llu, next_offset %d", homa_is_client(rpc->id) ? 
"request" - : "response", - (long unsigned int) rpc->id, + : "response", rpc->id, rpc->msgout.next_xmit_offset); } } +#endif /* See strip.py */ + +/** + * unit_print_gaps() - Returns a static string describing the gaps in an RPC. + * @rpc: Log the gaps in this RPC. + */ +const char *unit_print_gaps(struct homa_rpc *rpc) +{ + struct homa_gap *gap; + static char buffer[1000]; + int used = 0; + + buffer[0] = 0; + list_for_each_entry(gap, &rpc->msgin.gaps, links) { + if (used != 0) + used += snprintf(buffer + used, sizeof(buffer) - used, + "; "); + used += snprintf(buffer + used, sizeof(buffer) - used, + "start %d, end %d", gap->start, gap->end); + if (gap->time != 0) + used += snprintf(buffer + used, sizeof(buffer) - used, + ", time %llu", gap->time); + } + return buffer; +} + +/** + * unit_reset_tx() - Reset the state of an RPC so that it appears no packets + * have been transmitted. + */ +void unit_reset_tx(struct homa_rpc *rpc) +{ + struct sk_buff *skb; + + for (skb = rpc->msgout.packets; skb != NULL; + skb = homa_get_skb_info(skb)->next_skb) + skb_dst_drop(skb); + rpc->msgout.next_xmit = &rpc->msgout.packets; + rpc->msgout.next_xmit_offset = 0; +} /** * unit_server_rpc() - Create a homa_server_rpc and arrange for it to be @@ -352,66 +382,62 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, struct in6_addr *server_ip, int client_port, int id, int req_length, int resp_length) { - int bytes_received; - int incoming_delta = 0; - struct data_header h = { - .common = { - .sport = htons(client_port), - .dport = htons(hsk->port), - .type = DATA, - .sender_id = cpu_to_be64(id ^ 1) - }, - .message_length = htonl(req_length), - .incoming = htonl(10000), - .cutoff_version = 0, - .retransmit = 0, - .seg = {.offset = 0, - .segment_length = htonl(UNIT_TEST_DATA_PER_PACKET), - .ack = {0, 0, 0}} + int bytes_received, created; + struct homa_data_hdr h; + int status; + + memset(&h, 0, sizeof(h)); + h.common = (struct homa_common_hdr){ + .sport = htons(client_port), + .dport = htons(hsk->port), + .type = DATA, + .sender_id = cpu_to_be64(id ^ 1) }; - if (req_length < UNIT_TEST_DATA_PER_PACKET) - h.seg.segment_length = htonl(req_length); - struct homa_rpc *srpc = homa_rpc_new_server(hsk, client_ip, &h); + h.message_length = htonl(req_length); +#ifndef __STRIP__ /* See strip.py */ + h.incoming = htonl(10000); +#endif /* See strip.py */ + struct homa_rpc *srpc = homa_rpc_alloc_server(hsk, client_ip, &h, + &created); + if (IS_ERR(srpc)) return NULL; EXPECT_EQ(srpc->completion_cookie, 0); homa_rpc_unlock(srpc); - homa_data_pkt(mock_skb_new(client_ip, &h.common, + homa_dispatch_pkts(mock_skb_alloc(client_ip, &h.common, (req_length > UNIT_TEST_DATA_PER_PACKET) - ? UNIT_TEST_DATA_PER_PACKET : req_length , 0), - srpc, NULL, &incoming_delta); - atomic_add(incoming_delta, &hsk->homa->total_incoming); + ? 
UNIT_TEST_DATA_PER_PACKET : req_length, 0)); if (state == UNIT_RCVD_ONE_PKT) return srpc; for (bytes_received = UNIT_TEST_DATA_PER_PACKET; bytes_received < req_length; bytes_received += UNIT_TEST_DATA_PER_PACKET) { int this_size = req_length - bytes_received; + if (this_size > UNIT_TEST_DATA_PER_PACKET) this_size = UNIT_TEST_DATA_PER_PACKET; h.seg.offset = htonl(bytes_received); - h.seg.segment_length = htonl(this_size); - incoming_delta = 0; - homa_data_pkt(mock_skb_new(client_ip, &h.common, - this_size , 0), srpc, NULL, &incoming_delta); - atomic_add(incoming_delta, &hsk->homa->total_incoming); + homa_dispatch_pkts(mock_skb_alloc(client_ip, &h.common, + this_size, 0)); } if (state == UNIT_RCVD_MSG) return srpc; - list_del_init(&srpc->ready_links); srpc->state = RPC_IN_SERVICE; if (state == UNIT_IN_SERVICE) return srpc; - if (homa_message_out_init(srpc, - unit_iov_iter((void *) 2000, resp_length), 0) != 0) + homa_rpc_lock(srpc); + status = homa_message_out_fill(srpc, unit_iov_iter((void *) 2000, + resp_length), 0); + homa_rpc_unlock(srpc); + if (status != 0) goto error; srpc->state = RPC_OUTGOING; if (state == UNIT_OUTGOING) return srpc; - FAIL("unit_server_rpc received unexpected state %d", state); + FAIL("%s received unexpected state %d", __func__, state); - error: - homa_rpc_free(srpc); +error: + homa_rpc_end(srpc); return NULL; } @@ -435,6 +461,7 @@ struct iov_iter *unit_iov_iter(void *buffer, size_t length) { static struct iovec iovec; static struct iov_iter iter; + iovec.iov_base = buffer; iovec.iov_len = length; iov_iter_init(&iter, WRITE, &iovec, 1, length); @@ -449,9 +476,56 @@ struct iov_iter *unit_iov_iter(void *buffer, size_t length) char *unit_ack_string(struct homa_ack *ack) { static char buffer[1000]; + snprintf(buffer, sizeof(buffer), - "client_port %d, server_port %d, client_id %llu", - ntohs(ack->client_port), ntohs(ack->server_port), - be64_to_cpu(ack->client_id)); + "server_port %d, client_id %llu", + ntohs(ack->server_port), be64_to_cpu(ack->client_id)); return buffer; } + +/** + * unit_homa_destroy() - When unit tests are run, this function is invoked + * by homa_destroy. It checks for various errors and reports them. + * @homa: Homa shared data that is about to be deleted. + */ +void unit_homa_destroy(struct homa *homa) +{ + /* Currently nothing to check. */ +} + +/** + * unit_sock_destroy() - Invoked by unit tests to clean up and destroy + * a socket. + * @hsk: Socket to destroy. + */ +void unit_sock_destroy(struct homa_sock *hsk) +{ + homa_sock_shutdown(hsk); + homa_sock_destroy(&hsk->sock); +} + +/** + * unit_count_peers() - Return a count of the number of peers in the + * homa_peertab for @homa (could also include peers from other homas). + * @homa: Used to locate homa_peertab to count.
+ */ +int unit_count_peers(struct homa *homa) +{ + struct rhashtable_iter iter; + struct homa_peer *peer; + int count = 0; + + rhashtable_walk_enter(&homa->peertab->ht, &iter); + rhashtable_walk_start(&iter); + while (1) { + peer = rhashtable_walk_next(&iter); + if (!peer) + break; + if (IS_ERR(peer)) + continue; + count++; + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + return count; +} \ No newline at end of file diff --git a/test/utils.h b/test/utils.h index 5a48796f..5c739872 100644 --- a/test/utils.h +++ b/test/utils.h @@ -1,20 +1,9 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* Utility functions for unit tests, implemented in C. */ +struct homa_message_out; +struct homa_rpc; struct unit_hash; /** @@ -41,30 +30,45 @@ enum unit_rpc_state { UNIT_IN_SERVICE = 24, }; -extern char *unit_ack_string(struct homa_ack *ack); -extern struct homa_rpc - *unit_client_rpc(struct homa_sock *hsk, - enum unit_rpc_state state, struct in6_addr *client_ip, - struct in6_addr *server_ip, int server_port, int id, - int req_length, int resp_length); -extern struct in6_addr - unit_get_in_addr(char *s); -extern struct iov_iter - *unit_iov_iter(void *buffer, size_t length); -extern int unit_list_length(struct list_head *head); -extern void unit_log_active_ids(struct homa_sock *hsk); -extern void unit_log_filled_skbs(struct sk_buff *skb, int verbose); -extern void unit_log_frag_list(struct sk_buff *skb, int verbose); -extern void unit_log_grantables(struct homa *homa); -extern void unit_log_hashed_rpcs(struct homa_sock *hsk); -extern void unit_log_message_out_packets( - struct homa_message_out *message, int verbose); -extern struct homa_rpc - *unit_server_rpc(struct homa_sock *hsk, - enum unit_rpc_state state, struct in6_addr *server_ip, - struct in6_addr *client_ip, int client_port, int id, - int req_length, int resp_length); -extern void unit_log_skb_list(struct sk_buff_head *packets, - int verbose); -extern void unit_log_throttled(struct homa *homa); -extern void unit_teardown(void); +char *unit_ack_string(struct homa_ack *ack); +struct homa_rpc + *unit_client_rpc(struct homa_sock *hsk, + enum unit_rpc_state state, struct in6_addr *client_ip, + struct in6_addr *server_ip, int server_port, int id, + int req_length, int resp_length); +int unit_count_peers(struct homa *homa); +struct in6_addr + unit_get_in_addr(char *s); +void unit_homa_destroy(struct homa *homa); +struct iov_iter + *unit_iov_iter(void *buffer, size_t length); +int unit_list_length(struct list_head *head); +void unit_log_active_ids(struct homa_sock *hsk); +void unit_log_filled_skbs(struct sk_buff *skb, int verbose); +void unit_log_frag_list(struct sk_buff *skb, int verbose); +void unit_log_hashed_rpcs(struct homa_sock *hsk); 
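/* Editor's note: a typical test combines these helpers along the
 * following lines (sketch; addresses, ports, ids and lengths are
 * illustrative values taken from the unit tests):
 *
 *	struct in6_addr client_ip = unit_get_in_addr("196.168.0.1");
 *	struct in6_addr server_ip = unit_get_in_addr("1.2.3.4");
 *	struct homa_rpc *crpc = unit_client_rpc(&hsk, UNIT_RCVD_ONE_PKT,
 *			&client_ip, &server_ip, 99, 1234, 1000, 20000);
 *
 * This yields a client RPC whose 1000-byte request has been fully
 * transmitted and whose 20000-byte response has received one data
 * packet; the desired end state is chosen via enum unit_rpc_state.
 */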
+void unit_log_message_out_packets(struct homa_message_out *message, + int verbose); +void unit_log_skb_list(struct sk_buff_head *packets, + int verbose); +const char *unit_print_gaps(struct homa_rpc *rpc); +void unit_reset_tx(struct homa_rpc *rpc); +struct homa_rpc + *unit_server_rpc(struct homa_sock *hsk, + enum unit_rpc_state state, + struct in6_addr *server_ip, + struct in6_addr *client_ip, + int client_port, int id, int req_length, + int resp_length); +void unit_sock_destroy(struct homa_sock *hsk); +void unit_teardown(void); + +#ifndef __STRIP__ /* See strip.py */ +void unit_log_grantables(struct homa *homa); +void unit_log_throttled(struct homa *homa); +#endif /* See strip.py */ + +/* Kludge to avoid including arpa/inet.h, which causes definition + * conflicts with kernel header files. + */ +int inet_pton(int af, const char *src, void *dst); diff --git a/timetrace.c b/timetrace.c index 216b0408..e196ee48 100644 --- a/timetrace.c +++ b/timetrace.c @@ -1,50 +1,49 @@ -/* Copyright (c) 2019-2020 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#include +#pragma GCC diagnostic pop + #ifndef __UNIT_TEST__ /* Uncomment the line below if the main Linux kernel has been compiled with * timetrace stubs; we will then connect the timetrace mechanism here with * those stubs to allow the rest of the kernel to log in our buffers. 
*/ //#define TT_KERNEL 1 -#endif +#endif /* __UNIT_TEST__ */ #ifdef TT_KERNEL -extern int tt_linux_buffer_mask; extern struct tt_buffer *tt_linux_buffers[]; +extern void (*tt_linux_freeze)(void); extern atomic_t *tt_linux_freeze_count; extern atomic_t tt_linux_freeze_no_homa; extern int *tt_linux_homa_temp; -extern int tt_linux_homa_temp_default[]; -extern void tt_inc_metric(int metric, __u64 count); -extern void (*tt_linux_inc_metrics)(int metric, __u64 count); -extern void tt_linux_skip_metrics(int metric, __u64 count); +extern int tt_linux_homa_temp_default[16]; +extern void (*tt_linux_inc_metrics)(int metric, u64 count); +extern void (*tt_linux_record)(struct tt_buffer *buffer, u64 timestamp, + const char *format, u32 arg0, u32 arg1, + u32 arg2, u32 arg3); +extern void tt_linux_skip_metrics(int metric, u64 count); extern void (*tt_linux_printk)(void); extern void (*tt_linux_dbg1)(char *msg, ...); extern void (*tt_linux_dbg2)(char *msg, ...); extern void (*tt_linux_dbg3)(char *msg, ...); extern void tt_linux_nop(void); -extern void homa_trace(__u64 u0, __u64 u1, int i0, int i1); +extern void homa_trace(u64 u0, u64 u1, int i0, int i1); + +extern void ltt_record_nop(struct tt_buffer *buffer, u64 timestamp, + const char *format, u32 arg0, u32 arg1, + u32 arg2, u32 arg3); #endif +void tt_inc_metric(int metric, u64 count); /* Separate buffers for each core: this eliminates the need for * synchronization in tt_record, which improves performance significantly. * NR_CPUS is an overestimate of the actual number of cores; we use it * here, rather than nr_cpu_ids, because it allows for static allocation - * of this array. And + * of this array. */ struct tt_buffer *tt_buffers[NR_CPUS]; @@ -65,7 +64,7 @@ static struct proc_dir_entry *tt_dir_entry; * isn't safe here, because tt_freeze gets called at times when threads * can't sleep. */ -static spinlock_t tt_lock; +static struct mutex tt_mutex; /* No new timetrace entries will be made whenever this is nonzero (counts * the number of active /proc reads, plus 1 more if tt_frozen is true). @@ -76,7 +75,7 @@ atomic_t tt_freeze_count = {.counter = 1}; /* True means that tt_freeze has been called since the last time the * timetrace was read. */ -bool tt_frozen; +atomic_t tt_frozen; /* True means timetrace has been successfully initialized. */ static bool init; @@ -91,69 +90,69 @@ int tt_buffer_size = TT_BUF_SIZE; int tt_pf_storage = TT_PF_BUF_SIZE; /* Set during tests to disable "cpu_khz" line in trace output. */ -bool tt_test_no_khz = false; +bool tt_test_no_khz; /** * tt_init(): Enable time tracing, create /proc file for reading traces. * @proc_file: Name of a file in /proc; this file can be read to extract - * the current timetrace. - * @temp: Pointer to homa's "temp" configuration parameters, which - * we should make available to the kernel. NULL means no - * such variables available. + * the current timetrace. NULL means don't create a /proc file + * (such as when running unit tests). * * Return : 0 means success, anything else means an error occurred (a * log message will be printed to describe the error). 
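 *
 * Example (editor's note): after tt_init("timetrace") the accumulated
 * trace can be read from /proc/timetrace; tt_init(NULL) enables tracing
 * without creating a /proc file.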
*/ -int tt_init(char *proc_file, int *temp) +int tt_init(char *proc_file) { int i; - if (init) { + pr_notice("tt_init invoked, init %d, proc_file %s\n", init, proc_file); + + if (init) return 0; - } for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer; + buffer = kmalloc(sizeof(*buffer), GFP_KERNEL); - if (buffer == NULL) { - printk(KERN_ERR "timetrace couldn't allocate " - "tt_buffers\n"); + if (!buffer) goto error; - } memset(buffer, 0, sizeof(*buffer)); tt_buffers[i] = buffer; } - tt_dir_entry = proc_create(proc_file, S_IRUGO, NULL, &tt_pops); - if (!tt_dir_entry) { - printk(KERN_ERR "couldn't create /proc/%s for timetrace " - "reading\n", proc_file); - goto error; + if (proc_file) { + tt_dir_entry = proc_create(proc_file, 0444, NULL, &tt_pops); + if (!tt_dir_entry) { + pr_err("couldn't create /proc/%s for timetrace reading\n", + proc_file); + goto error; + } + } else { + tt_dir_entry = NULL; } - spin_lock_init(&tt_lock); - tt_freeze_count.counter = 0; - tt_frozen = false; - init = true; - #ifdef TT_KERNEL - for (i = 0; i < nr_cpu_ids; i++) { + for (i = 0; i < nr_cpu_ids; i++) tt_linux_buffers[i] = tt_buffers[i]; - } - tt_linux_buffer_mask = TT_BUF_SIZE-1; + tt_linux_record = tt_record_buf; + tt_linux_freeze = tt_freeze; tt_linux_freeze_count = &tt_freeze_count; tt_linux_inc_metrics = tt_inc_metric; tt_linux_printk = tt_printk; tt_linux_dbg1 = tt_dbg1; tt_linux_dbg2 = tt_dbg2; tt_linux_dbg3 = tt_dbg3; - if (temp) - tt_linux_homa_temp = temp; + memset(tt_debug_int64, 0, sizeof(tt_debug_int64)); #endif + mutex_init(&tt_mutex); + atomic_set(&tt_frozen, 0); + atomic_set(&tt_freeze_count, 0); + init = true; + return 0; - error: +error: for (i = 0; i < nr_cpu_ids; i++) { kfree(tt_buffers[i]); tt_buffers[i] = NULL; @@ -162,28 +161,44 @@ int tt_init(char *proc_file, int *temp) } /** - * @tt_destroy(): Disable time tracing and disable the /proc file for + * tt_set_temp() - Make the "temp" variables from a struct homa available + * to the rest of the Linux kernel. + * @temp: Pointer to homa's "temp" configuration parameters, which + * we should make available to the kernel. + */ +void tt_set_temp(int *temp) +{ +#ifdef TT_KERNEL + tt_linux_homa_temp = temp; +#endif +} + +/** + * tt_destroy(): Disable time tracing and disable the /proc file for * reading traces. */ void tt_destroy(void) { int i; - spin_lock(&tt_lock); + + tt_freeze_count.counter = 1; + mutex_lock(&tt_mutex); if (init) { init = false; - proc_remove(tt_dir_entry); + if (tt_dir_entry) + proc_remove(tt_dir_entry); } for (i = 0; i < nr_cpu_ids; i++) { kfree(tt_buffers[i]); tt_buffers[i] = NULL; } - tt_freeze_count.counter = 1; #ifdef TT_KERNEL + tt_linux_record = ltt_record_nop; + tt_linux_freeze = tt_linux_nop; tt_linux_freeze_count = &tt_linux_freeze_no_homa; - for (i = 0; i < nr_cpu_ids; i++) { + for (i = 0; i < nr_cpu_ids; i++) tt_linux_buffers[i] = NULL; - } tt_linux_inc_metrics = tt_linux_skip_metrics; tt_linux_printk = tt_linux_nop; tt_linux_dbg1 = (void (*)(char *, ...)) tt_linux_nop; @@ -196,26 +211,37 @@ void tt_destroy(void) tt_linux_homa_temp = tt_linux_homa_temp_default; #endif - spin_unlock(&tt_lock); + mutex_unlock(&tt_mutex); } /** - * Stop recording timetrace events until the trace has been read - * using the /proc file. When recording resumes after reading the + * tt_freeze() - Stop recording timetrace events until the trace has been + * read using the /proc file. When recording resumes after reading the * file, the buffers will be cleared. 
*/ void tt_freeze(void) { - if (tt_frozen) - return; - tt_record("timetrace frozen"); - printk(KERN_NOTICE "tt_freeze invoked\n"); - spin_lock(&tt_lock); - if (!tt_frozen) { - tt_frozen = true; + /* Need to synchronize here to make sure tt_freeze_count only + * gets incremented once, even under concurrent calls to this + * function. + */ + if (atomic_xchg(&tt_frozen, 1) == 0) { + tt_record("timetrace frozen"); atomic_inc(&tt_freeze_count); + pr_err("%s invoked\n", __func__); + } +} + +/** + * tt_unfreeze() - Release any freeze that may be in effect: normal + * timetrace recording will resume if it had stopped. + */ +void tt_unfreeze(void) +{ + pr_err("%s invoked\n", __func__); + if (atomic_xchg(&tt_frozen, 0) == 1) { + atomic_dec(&tt_freeze_count); } - spin_unlock(&tt_lock); } /** @@ -232,16 +258,18 @@ void tt_freeze(void) * to this method. This pointer is stored in the buffer, so * the caller must ensure that its contents will not change * over its lifetime in the trace. - * @arg0 Argument to use when printing a message about this event. - * @arg1 Argument to use when printing a message about this event. - * @arg2 Argument to use when printing a message about this event. - * @arg3 Argument to use when printing a message about this event. + * @arg0: Argument to use when printing a message about this event. + * @arg1: Argument to use when printing a message about this event. + * @arg2: Argument to use when printing a message about this event. + * @arg3: Argument to use when printing a message about this event. */ -void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, - const char* format, __u32 arg0, __u32 arg1, __u32 arg2, - __u32 arg3) +void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, + const char *format, u32 arg0, u32 arg1, u32 arg2, + u32 arg3) { struct tt_event *event; + int index; + if (unlikely(atomic_read(&tt_freeze_count) > 0)) { // In order to ensure that reads produce consistent // results, don't record concurrently (this could cause @@ -249,13 +277,14 @@ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, return; } - event = &buffer->events[buffer->next_index]; - buffer->next_index = (buffer->next_index + 1) -#ifdef __UNIT_TEST__ - & (tt_buffer_size-1); -#else - & (TT_BUF_SIZE-1); -#endif + /* Even though there is a separate tt buffer for each core, we + * still have to use an atomic operation to update next_index, + * because an interrupt could occur while executing this function. + * Before the atomic increment was added, tt entries were occasionally + * lost. + */ + index = atomic_fetch_inc_relaxed(&buffer->next_index) & TT_BUF_MASK; + event = &buffer->events[index]; event->timestamp = timestamp; event->format = format; @@ -267,7 +296,7 @@ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, /** * tt_find_oldest() - This function is invoked when printing out the - * Timetrace: it finds the oldest event to print from each trace. + * timetrace: it finds the oldest event to print from each trace. * This will be events[0] if we never completely filled the buffer, * otherwise events[nextIndex+1]. This means we don't print the entry at * nextIndex; this is convenient because it simplifies boundary checks @@ -277,25 +306,26 @@ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, * complete, since there may have been events that were discarded). * @pos: Array with NPOS elements; will be filled in with the oldest * index in the trace for each core. + * Return: Time of oldest log entry that should be printed. 
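 *
 * Worked example (editor's illustration): with tt_buffer_size 4 and a
 * buffer that has wrapped, next_index 2 gives pos (2 + 1) & 3 = 3, so
 * entries 3, 0 and 1 are printed and the slot at next_index itself is
 * skipped.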
*/ -void tt_find_oldest(int *pos) +u64 tt_find_oldest(int *pos) { - struct tt_buffer* buffer; + struct tt_buffer *buffer; + u64 start_time = 0; int i; - __u64 start_time = 0; for (i = 0; i < nr_cpu_ids; i++) { buffer = tt_buffers[i]; - if (buffer->events[tt_buffer_size-1].format == NULL) { + if (!buffer->events[tt_buffer_size - 1].format) { pos[i] = 0; } else { - int index = (buffer->next_index + 1) - & (tt_buffer_size-1); + int index = (atomic_read(&buffer->next_index) + 1) + & TT_BUF_MASK; struct tt_event *event = &buffer->events[index]; + pos[i] = index; - if (event->timestamp > start_time) { + if (event->timestamp > start_time) start_time = event->timestamp; - } } } @@ -303,12 +333,16 @@ * sure that there's no missing data in what we print. */ for (i = 0; i < nr_cpu_ids; i++) { + int next; + buffer = tt_buffers[i]; - while ((buffer->events[pos[i]].timestamp < start_time) - && (pos[i] != buffer->next_index)) { - pos[i] = (pos[i] + 1) & (tt_buffer_size-1); + next = tt_get_buf_index(buffer); + while (buffer->events[pos[i]].timestamp < start_time && + pos[i] != next) { + pos[i] = (pos[i] + 1) & TT_BUF_MASK; } } + return start_time; } /** @@ -321,16 +355,16 @@ */ int tt_proc_open(struct inode *inode, struct file *file) { - struct tt_proc_file* pf = NULL; + struct tt_proc_file *pf = NULL; int result = 0; - spin_lock(&tt_lock); + mutex_lock(&tt_mutex); if (!init) { result = -EINVAL; goto done; } - pf = kmalloc(sizeof(*pf), GFP_KERNEL); - if (pf == NULL) { + pf = kmalloc(sizeof(*pf), GFP_ATOMIC); + if (!pf) { result = -ENOMEM; goto done; } @@ -344,40 +378,41 @@ int tt_proc_open(struct inode *inode, struct file *file) if (!tt_test_no_khz) { pf->bytes_available = snprintf(pf->msg_storage, TT_PF_BUF_SIZE, - "cpu_khz: %u\n", cpu_khz); + "cpu_khz: %u\n", tsc_khz); } - done: - spin_unlock(&tt_lock); +done: + mutex_unlock(&tt_mutex); return result; } /** * tt_proc_read() - This function is invoked to handle read kernel calls on * /proc/timetrace. - * @file: Information about the file being read. - * @buffer: Address in user space of the buffer in which data from the file - * should be returned. - * @length: Number of bytes available at @buffer. - * @offset: Current read offset within the file. For now, we assume I/O - * is done sequentially, so we ignore this. + * @file: Information about the file being read. + * @user_buf: Address in user space of the buffer in which data from the file + * should be returned. + * @length: Number of bytes available at @user_buf. + * @offset: Current read offset within the file. For now, we assume I/O + * is done sequentially, so we ignore this. * * Return: the number of bytes returned at @user_buf. 0 means the end of the * file was reached, and a negative number indicates an error (-errno). */ ssize_t tt_proc_read(struct file *file, char __user *user_buf, - size_t length, loff_t *offset) + size_t length, loff_t *offset) { + struct tt_proc_file *pf = file->private_data; + /* # bytes of data that have accumulated in pf->msg_storage but * haven't been copied to user space yet.
*/ int copied_to_user = 0; - struct tt_proc_file *pf = file->private_data; - spin_lock(&tt_lock); - if ((pf == NULL) || (pf->file != file)) { - printk(KERN_ERR "tt_metrics_read found damaged " - "private_data: 0x%p\n", file->private_data); + mutex_lock(&tt_mutex); + if (!pf || pf->file != file) { + pr_err("%s found damaged private_data: 0x%p\n", __func__, + file->private_data); copied_to_user = -EINVAL; goto done; } @@ -393,35 +428,34 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, struct tt_event *event; int entry_length, chunk_size, available, i, failed_to_copy; int current_core = -1; - __u64 earliest_time = ~0; + u64 earliest_time = ~0; /* Check all the traces to find the earliest available event. */ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + event = &buffer->events[pf->pos[i]]; - if ((pf->pos[i] != buffer->next_index) - && (event->timestamp < earliest_time)) { - current_core = i; - earliest_time = event->timestamp; + if (pf->pos[i] != tt_get_buf_index(buffer) && + event->timestamp < earliest_time) { + current_core = i; + earliest_time = event->timestamp; } } if (current_core < 0) { - /* None of the traces have any more events to process. */ - goto flush; + /* None of the traces have any more events. */ + goto flush; } /* Format one event. */ - event = &(tt_buffers[current_core]->events[ - pf->pos[current_core]]); + event = &(tt_buffers[current_core]->events[pf->pos[current_core]]); available = tt_pf_storage - (pf->next_byte + pf->bytes_available - pf->msg_storage); - if (available == 0) { + if (available == 0) goto flush; - } entry_length = snprintf(pf->next_byte + pf->bytes_available, - available, "%lu [C%02d] ", - (long unsigned int) event->timestamp, - current_core); + available, "%lu [C%02d] ", + (unsigned long)event->timestamp, + current_core); if (available >= entry_length) entry_length += snprintf(pf->next_byte + pf->bytes_available + entry_length, @@ -432,7 +466,8 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, /* Not enough room for this entry. */ if (pf->bytes_available == 0) { /* Even a full buffer isn't enough for - * this entry; truncate the entry. */ + * this entry; truncate the entry. + */ entry_length = available - 1; } else { goto flush; } @@ -442,18 +477,17 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, pf->next_byte[pf->bytes_available + entry_length] = '\n'; pf->bytes_available += entry_length + 1; pf->pos[current_core] = (pf->pos[current_core] + 1) - & (tt_buffer_size-1); + & (tt_buffer_size - 1); continue; - flush: +flush: chunk_size = pf->bytes_available; - if (chunk_size > (length - copied_to_user)) { + if (chunk_size > (length - copied_to_user)) chunk_size = length - copied_to_user; - } if (chunk_size == 0) goto done; failed_to_copy = copy_to_user(user_buf + copied_to_user, - pf->next_byte, chunk_size); + pf->next_byte, chunk_size); chunk_size -= failed_to_copy; pf->bytes_available -= chunk_size; if (pf->bytes_available == 0) @@ -468,12 +502,11 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, } } - done: - spin_unlock(&tt_lock); +done: + mutex_unlock(&tt_mutex); return copied_to_user; } - /** * tt_proc_lseek() - This function is invoked to handle seeks on * /proc/timetrace. Right now seeks are ignored: the file must be @@ -481,6 +514,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, * @file: Information about the file being read. * @offset: Distance to seek, in bytes * @whence: Starting point from which to measure the distance to seek.
+ * Return: current position within file. */ loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence) { @@ -497,25 +531,24 @@ */ int tt_proc_release(struct inode *inode, struct file *file) { + struct tt_proc_file *pf = file->private_data; int i; - struct tt_proc_file *pf = file->private_data; - if ((pf == NULL) || (pf->file != file)) { - printk(KERN_ERR "tt_metrics_release found damaged " - "private_data: 0x%p\n", file->private_data); + if (!pf || pf->file != file) { + pr_err("%s found damaged private_data: 0x%p\n", __func__, + file->private_data); return -EINVAL; } kfree(pf); file->private_data = NULL; - spin_lock(&tt_lock); + mutex_lock(&tt_mutex); if (init) { - if (tt_frozen && (atomic_read(&tt_freeze_count) == 2)) { + if (atomic_read(&tt_freeze_count) == 2 && + atomic_xchg(&tt_frozen, 0) == 1) atomic_dec(&tt_freeze_count); - tt_frozen = false; - } if (atomic_read(&tt_freeze_count) == 1) { /* We are the last active open of the file; reset all of @@ -523,24 +556,35 @@ int tt_proc_release(struct inode *inode, struct file *file) */ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; - buffer->events[tt_buffer_size-1].format = NULL; - buffer->next_index = 0; + + buffer->events[tt_buffer_size - 1].format = NULL; + atomic_set(&buffer->next_index, 0); } } atomic_dec(&tt_freeze_count); } - spin_unlock(&tt_lock); + mutex_unlock(&tt_mutex); return 0; } /** - * tt_printk() - Print the contents of the timetrace to the system log. + * tt_print_file() - Print the contents of the timetrace to a given file. * Useful in situations where the system is too unstable to extract a - * timetrace by reading /proc/timetrace. + * timetrace by reading /proc/timetrace. Unfortunately, this function cannot + * be invoked while preemption is disabled (e.g., while holding a spin lock). + * As of 2/2024, this function is not reliable in situations where the machine + * is about to crash. It seems to print the trace, but after reboot the + * file isn't there. + * @path: Name of the file in which to print the timetrace; should be + * an absolute file name. */ -void tt_printk(void) +void tt_print_file(char *path) { + /* Static buffer for accumulating output data. */ + static char buffer[10000]; + struct file *filp = NULL; + /* Index of the next entry to return from each tt_buffer. * This array is too large to allocate on the stack, and we don't * want to allocate space dynamically (this function could be @@ -550,65 +594,291 @@ void tt_printk(void) */ static int pos[NR_CPUS]; static atomic_t active; + int bytes_used = 0; + loff_t offset = 0; + int err; if (atomic_xchg(&active, 1)) { - printk(KERN_NOTICE "concurrent call to tt_printk aborting\n"); + pr_err("concurrent call to %s aborting\n", __func__); return; } if (!init) return; + + filp = filp_open(path, O_WRONLY | O_CREAT, 0666); + if (IS_ERR(filp)) { + pr_err("%s couldn't open %s: error %ld\n", __func__, path, + -PTR_ERR(filp)); + filp = NULL; + goto done; + } + atomic_inc(&tt_freeze_count); tt_find_oldest(pos); + bytes_used += snprintf(buffer + bytes_used, + sizeof(buffer) - bytes_used, + "cpu_khz: %u\n", tsc_khz); + /* Each iteration of this loop outputs one event. */ while (true) { + u64 earliest_time = ~0; struct tt_event *event; + int current_core = -1; int i; + + /* Check all the traces to find the earliest available event.
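+ * Repeatedly picking the earliest remaining event merges the per-core circular buffers into a single timestamp-ordered stream.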
*/ + for (i = 0; i < nr_cpu_ids; i++) { + struct tt_buffer *buffer = tt_buffers[i]; + + event = &buffer->events[pos[i]]; + if (pos[i] != tt_get_buf_index(buffer) && + event->timestamp < earliest_time) { + current_core = i; + earliest_time = event->timestamp; + } + } + if (current_core < 0) { + /* None of the traces have any more events. */ + break; + } + event = &(tt_buffers[current_core]->events[pos[current_core]]); + pos[current_core] = (pos[current_core] + 1) & + (tt_buffer_size - 1); + + bytes_used += snprintf(buffer + bytes_used, + sizeof(buffer) - bytes_used, + "%lu [C%02d] ", + (unsigned long)event->timestamp, + current_core); + bytes_used += snprintf(buffer + bytes_used, + sizeof(buffer) - bytes_used, + event->format, event->arg0, + event->arg1, event->arg2, event->arg3); + if (bytes_used < sizeof(buffer)) { + buffer[bytes_used] = '\n'; + bytes_used++; + } + if ((bytes_used + 1000) >= sizeof(buffer)) { + err = kernel_write(filp, buffer, bytes_used, + &offset); + if (err < 0) { + pr_notice("%s got error %d writing %s\n", + __func__, -err, path); + goto done; + } + bytes_used = 0; + } + } + if (bytes_used > 0) { + err = kernel_write(filp, buffer, bytes_used, &offset); + if (err < 0) + pr_err("%s got error %d writing %s\n", + __func__, -err, path); + } + +done: + if (filp) { + err = vfs_fsync(filp, 0); + if (err < 0) + pr_err("%s got error %d in fsync\n", __func__, -err); + err = filp_close(filp, NULL); + if (err < 0) + pr_err("%s got error %d in filp_close\n", __func__, + -err); + } + atomic_dec(&tt_freeze_count); + atomic_set(&active, 0); +} + +/** + * tt_printk() - Print the contents of the timetrace to the system log. + * Useful in situations where the system is too unstable to extract a + * timetrace by reading /proc/timetrace. Note: the timetrace is printed + * most recent entry first (in the hopes that if buffer overflows + * disrupt the output, at least the most recent entries will be complete). + */ +void tt_printk(void) +{ + /* Index of the oldest entry to return from each tt_buffer. This + * array is too large to allocate on the stack, and we don't want to + * allocate space dynamically (this function could be called at a + * point where the world is going to hell). So, allocate the array + * statically and only allow one concurrent call to this function. + */ + static int oldest[NR_CPUS]; + static atomic_t active; + + /* Index of the next entry to consider from each tt_buffer, or -1 if + * all entries have been processed. + */ + static int pos[NR_CPUS]; + u64 start_time; + int events; + int i; + + if (atomic_xchg(&active, 1)) { + pr_err("concurrent call to %s aborting\n", __func__); + return; + } + if (!init) { + atomic_set(&active, 0); + return; + } + atomic_inc(&tt_freeze_count); + pr_err("Dumping timetrace on core %d\n", raw_smp_processor_id()); + start_time = tt_find_oldest(oldest); + events = 0; + for (i = 0; i < nr_cpu_ids; i++) { + if (oldest[i] == tt_get_buf_index(tt_buffers[i])) + pos[i] = -1; + else + pos[i] = (atomic_read(&tt_buffers[i]->next_index) - 1) & + TT_BUF_MASK; + } + +#if 0 + /* Limit the number of entries logged per core (logging too many + * seems to cause entries to be lost). + */ + for (i = 0; i < nr_cpu_ids; i++) { + if (((pos[i] - oldest[i]) & (TT_BUF_SIZE - 1)) > 200) + oldest[i] = (pos[i] - 200) & (TT_BUF_SIZE - 1); + } +#endif + + pr_err("cpu_khz: %u, start: %llu\n", tsc_khz, start_time); + + /* Each iteration of this loop printk's one event.
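+ * Unlike tt_print_file, this loop emits events newest-first: each core's pos[] cursor walks backward until it passes that core's oldest entry.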
*/ + while (true) { + u64 latest_time = 0; + struct tt_event *event; int current_core = -1; char msg[200]; - /* Check all the traces to find the earliest available event. */ + /* Check all the traces to find the latest available event. */ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + + if (pos[i] == -1) + continue; event = &buffer->events[pos[i]]; - if ((pos[i] != buffer->next_index) - && (event->timestamp < earliest_time)) { - current_core = i; - earliest_time = event->timestamp; + if (event->timestamp >= latest_time) { + current_core = i; + latest_time = event->timestamp; } } if (current_core < 0) { - /* None of the traces have any more events to process. */ - break; + /* None of the traces have any more events. */ + break; } - event = &(tt_buffers[current_core]->events[ - pos[current_core]]); - pos[current_core] = (pos[current_core] + 1) - & (tt_buffer_size-1); + event = &(tt_buffers[current_core]->events[pos[current_core]]); + if (pos[current_core] == oldest[current_core]) + pos[current_core] = -1; + else + pos[current_core] = (pos[current_core] - 1) + & (tt_buffer_size - 1); snprintf(msg, sizeof(msg), event->format, event->arg0, - event->arg1, event->arg2, event->arg3); - printk(KERN_NOTICE "%lu [C%02d] %s\n", - (long unsigned int) event->timestamp, - current_core, msg); + event->arg1, event->arg2, event->arg3); + pr_err("%lu [C%02d] %s\n", + (unsigned long)event->timestamp, + current_core, msg); + events++; } + pr_err("Finished dumping %d timetrace events to syslog\n", events); atomic_dec(&tt_freeze_count); atomic_set(&active, 0); } +/** + * tt_get_messages() - Print the messages from all timetrace records to a + * caller-provided buffer. Only the messages are printed (no timestamps or + * core numbers). Intended primarily for use by unit tests. + * @buffer: Where to print messages. + * @length: Number of bytes available at @buffer; output will be truncated + * if needed to fit in this space. + */ +void tt_get_messages(char *buffer, size_t length) +{ + /* Index of the next entry to return from each tt_buffer (too + * large to allocate on stack, so allocate dynamically). + */ + int *pos = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL); + int printed = 0; + + *buffer = 0; + if (!pos || !init) + goto done; + atomic_inc(&tt_freeze_count); + tt_find_oldest(pos); + + /* Each iteration of this loop prints one event. */ + while (true) { + u64 earliest_time = ~0; + struct tt_event *event; + int current_core = -1; + int i, result; + + /* Check all the traces to find the earliest available event. */ + for (i = 0; i < nr_cpu_ids; i++) { + struct tt_buffer *buffer = tt_buffers[i]; + + event = &buffer->events[pos[i]]; + if (pos[i] != tt_get_buf_index(buffer) && + event->timestamp < earliest_time) { + current_core = i; + earliest_time = event->timestamp; + } + } + if (current_core < 0) { + /* None of the traces have any more events.
*/ + break; + } + event = &(tt_buffers[current_core]->events[pos[current_core]]); + pos[current_core] = (pos[current_core] + 1) + & (tt_buffer_size - 1); + + if (printed > 0) { + result = snprintf(buffer + printed, length - printed, + "; "); + if (result < 0 || result >= (length - printed)) + break; + printed += result; + } + result = snprintf(buffer + printed, length - printed, + event->format, event->arg0, event->arg1, + event->arg2, event->arg3); + if (result < 0 || result >= (length - printed)) + break; + printed += result; + } + + atomic_dec(&tt_freeze_count); + +done: + kfree(pos); } + /** * tt_dbg1() - Invoked by the Linux kernel for various temporary debugging * purposes. Arguments are defined as needed for a specific situation. + * @msg: String message providing useful debugging information. */ void tt_dbg1(char *msg, ...) { + pr_err("printk is currently disabled in tt_dbg1\n"); + return; + // pr_err("tt_dbg1 starting\n"); + // if (atomic_read(&tt_frozen)) + // return; + // tt_freeze(); + // tt_printk(); } /** * tt_dbg2() - Invoked by the Linux kernel for various temporary debugging * purposes. Arguments are defined as needed for a specific situation. + * @msg: String message providing useful debugging information. */ void tt_dbg2(char *msg, ...) { @@ -617,6 +887,7 @@ void tt_dbg2(char *msg, ...) /** * tt_dbg3() - Invoked by the Linux kernel for various temporary debugging * purposes. Arguments are defined as needed for a specific situation. + * @msg: String message providing useful debugging information. */ void tt_dbg3(char *msg, ...) { @@ -629,8 +900,9 @@ void tt_dbg3(char *msg, ...) * to increment. * @count: Amount by which to increment the metric. */ -void tt_inc_metric(int metric, __u64 count) +void tt_inc_metric(int metric, u64 count) { +#ifndef __STRIP__ /* See strip.py */ /* Maps from the metric argument to an offset within homa_metrics. * This level of indirection is needed so that the kernel doesn't * have to be recompiled every time a new metric gets added (which @@ -640,9 +912,37 @@ static int offsets[] = { offsetof(struct homa_metrics, napi_cycles), offsetof(struct homa_metrics, linux_softirq_cycles), + offsetof(struct homa_metrics, linux_pkt_alloc_bytes), + offsetof(struct homa_metrics, temp[0]), + offsetof(struct homa_metrics, temp[1]), + offsetof(struct homa_metrics, temp[2]), + offsetof(struct homa_metrics, temp[3]), + offsetof(struct homa_metrics, linux_softirqd_actions) }; - __u64 *metric_addr = (__u64 *)(((char *) - &homa_cores[raw_smp_processor_id()]->metrics) + u64 *metric_addr = (u64 *)(((char *)homa_metrics_per_cpu()) + offsets[metric]); *metric_addr += count; +#endif /* See strip.py */ } + +/** + * tt_record_tcp() - Create a timetrace record for a TCP packet, formatting + * data in a standard way. + * @format: Format string for tt_record4; must have % specs for + * source, dest, length, and ack/seq, in that order. + * @skb: Contains TCP packet with valid transport header. + * @saddr: Source address for packet. + * @daddr: Destination address for packet. + */ +void tt_record_tcp(char *format, struct sk_buff *skb, __be32 saddr, + __be32 daddr) +{ + struct tcphdr *th; + int data_length; + + th = (struct tcphdr *)skb_transport_header(skb); + data_length = skb->len - skb_transport_offset(skb) - th->doff * 4; + tt_record4(format, (ntohl(saddr) << 16) + ntohs(th->source), + (ntohl(daddr) << 16) + ntohs(th->dest), data_length, + data_length == 0 ?
ntohl(th->ack_seq) : ntohl(th->seq)); +} \ No newline at end of file diff --git a/timetrace.h b/timetrace.h index 40589881..1796ef6e 100644 --- a/timetrace.h +++ b/timetrace.h @@ -1,78 +1,77 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ #ifndef HOMA_TIMETRACE_H #define HOMA_TIMETRACE_H #include +#ifdef __UNIT_TEST__ +#undef get_cycles +#define get_cycles mock_get_cycles +u64 mock_get_cycles(void); +#endif /* __UNIT_TEST__ */ + // Change 1 -> 0 in the following line to disable time tracing globally. // Used only in debugging. #define ENABLE_TIME_TRACE 1 -/** - * Timetrace implements a circular buffer of entries, each of which +/* Timetrace implements a circular buffer of entries, each of which * consists of a fine-grain timestamp, a short descriptive string, and * a few additional values. It's typically used to record times at - * various points in in kernel operations, in order to find performance + * various points in kernel operations, in order to find performance * bottlenecks. It can record a trace relatively efficiently (< 10ns as * of 6/2018), and the trace can be retrieved by user programs for * analysis by reading a file in /proc. */ -/** - * This structure holds one entry in a tt_buffer. +/* This structure holds one entry in a tt_buffer. */ struct tt_event { /** * Time when this event occurred (in tt_get_cycles() units). */ - __u64 timestamp; + u64 timestamp; /** * Format string describing the event. NULL means that this * entry has never been occupied. */ - const char* format; + const char *format; /** * Up to 4 additional arguments that may be referenced by * @format when printing out this event. */ - __u32 arg0; - __u32 arg1; - __u32 arg2; - __u32 arg3; + u32 arg0; + u32 arg1; + u32 arg2; + u32 arg3; }; /* The number of events in a tt_buffer, as a power of 2. */ #define TT_BUF_SIZE_EXP 14 -#define TT_BUF_SIZE (1<<TT_BUF_SIZE_EXP) +#define TT_BUF_SIZE (1 << TT_BUF_SIZE_EXP) +#define TT_BUF_MASK (TT_BUF_SIZE - 1) /* This structure stores the time trace records for one core. */ struct tt_buffer { - int next_index; + atomic_t next_index; /* Index of slot to use for next event. */ struct tt_event events[TT_BUF_SIZE]; }; + +/* Returns the index of the slot in @buffer that will be used for the + * next event. + */ +static inline int tt_get_buf_index(struct tt_buffer *buffer) +{ + return atomic_read(&buffer->next_index) & TT_BUF_MASK; +} + +/* Holds information about an attempt to read timetrace information * using a /proc file. Several of these can exist simultaneously. */ struct tt_proc_file { /* Identifies a particular open file. */ - struct file* file; + struct file *file; /* Index of the next entry to return from each tt_buffer.
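+ * There is one cursor per core; tt_proc_read uses these cursors to merge the per-core buffers in timestamp order as the file is read.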
*/ int pos[NR_CPUS]; @@ -105,51 +112,60 @@ struct tt_proc_file { char *next_byte; }; -extern void tt_destroy(void); -extern void tt_freeze(void); -extern int tt_init(char *proc_file, int *temp); -extern void tt_record_buf(struct tt_buffer* buffer, __u64 timestamp, - const char* format, __u32 arg0, __u32 arg1, - __u32 arg2, __u32 arg3); +void tt_destroy(void); +void tt_freeze(void); +int tt_init(char *proc_file); +void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, + const char *format, u32 arg0, u32 arg1, + u32 arg2, u32 arg3); +void tt_set_temp(int *temp); +void tt_unfreeze(void); /* Private methods and variables: exposed so they can be accessed * by unit tests. */ -extern void tt_dbg1(char *msg, ...); -extern void tt_dbg2(char *msg, ...); -extern void tt_dbg3(char *msg, ...); -extern void tt_find_oldest(int *pos); -extern void tt_printk(void); -extern int tt_proc_open(struct inode *inode, struct file *file); -extern ssize_t tt_proc_read(struct file *file, char __user *user_buf, - size_t length, loff_t *offset); -extern int tt_proc_release(struct inode *inode, struct file *file); -extern loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence); +void tt_dbg1(char *msg, ...); +void tt_dbg2(char *msg, ...); +void tt_dbg3(char *msg, ...); +u64 tt_find_oldest(int *pos); +void tt_get_messages(char *buffer, size_t length); +void tt_print_file(char *path); +void tt_printk(void); +int tt_proc_open(struct inode *inode, struct file *file); +ssize_t tt_proc_read(struct file *file, char __user *user_buf, + size_t length, loff_t *offset); +int tt_proc_release(struct inode *inode, struct file *file); +loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence); +void tt_record_tcp(char *format, struct sk_buff *skb, __be32 saddr, + __be32 daddr); extern struct tt_buffer *tt_buffers[]; -extern int tt_buffer_size; extern atomic_t tt_freeze_count; -extern bool tt_frozen; +extern atomic_t tt_frozen; extern int tt_pf_storage; extern bool tt_test_no_khz; /* Debugging variables exposed by the version of timetrace built into * the kernel. */ -extern int64_t tt_debug_int64[100]; -extern void * tt_debug_ptr[100]; +extern s64 tt_debug_int64[100]; +extern void *tt_debug_ptr[100]; /** - * tt_rdtsc(): return the current value of the fine-grain CPU cycle counter - * (accessed via the RDTSC instruction). + * tt_get_cycles(): return the current value of the fine-grain CPU cycle + * counter. + * Return: see above. */ -static inline __u64 tt_rdtsc(void) +static inline u64 tt_get_cycles(void) { - __u32 lo, hi; - __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); - return (((__u64)hi << 32) | lo); +#ifdef __UNIT_TEST__ + extern u64 mock_tt_cycles; + return mock_tt_cycles; +#else /* __UNIT_TEST__ */ + return get_cycles(); +#endif /* __UNIT_TEST__ */ } -/** +/* * tt_recordN(): record an event, along with N parameters. * * @format: Format string for snprintf that will be used, along with @@ -166,52 +182,59 @@ static inline __u64 tt_rdtsc(void) * @arg2 Argument to use when printing a message about this event. * @arg3 Argument to use when printing a message about this event. 
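+ * + * Note: @format is stored in the trace by reference (see tt_record_buf), so it must remain valid for the lifetime of the trace.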
*/ -static inline void tt_record4(const char* format, __u32 arg0, __u32 arg1, - __u32 arg2, __u32 arg3) +static inline void tt_record4(const char *format, u32 arg0, u32 arg1, + u32 arg2, u32 arg3) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - arg0, arg1, arg2, arg3); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, arg0, arg1, arg2, arg3); #endif } -static inline void tt_record3(const char* format, __u32 arg0, __u32 arg1, - __u32 arg2) + +static inline void tt_record3(const char *format, u32 arg0, u32 arg1, + u32 arg2) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - arg0, arg1, arg2, 0); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, arg0, arg1, arg2, 0); #endif } -static inline void tt_record2(const char* format, __u32 arg0, __u32 arg1) + +static inline void tt_record2(const char *format, u32 arg0, u32 arg1) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - arg0, arg1, 0, 0); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, arg0, arg1, 0, 0); #endif } -static inline void tt_record1(const char* format, __u32 arg0) + +static inline void tt_record1(const char *format, u32 arg0) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - arg0, 0, 0, 0); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, arg0, 0, 0, 0); #endif } -static inline void tt_record(const char* format) + +static inline void tt_record(const char *format) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - 0, 0, 0, 0); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, 0, 0, 0, 0); #endif } -static inline __u32 tt_hi(void *p) +static inline u32 tt_hi(void *p) { - return ((__u64) p) >> 32; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshift-count-overflow" + return ((uintptr_t)p) >> 32; +#pragma GCC diagnostic pop } -static inline __u32 tt_lo(void *p) +static inline u32 tt_lo(void *p) { - return ((__u64) p) & 0xffffffff; + return ((uintptr_t)p) & 0xffffffff; } #endif // HOMA_TIMETRACE_H diff --git a/updateRemote b/updateRemote deleted file mode 100755 index 3a2bbd18..00000000 --- a/updateRemote +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh -# This script copies modified information from this directory to the machines -# given by the arguments (defaults to rcmaster if no arguments). - -if [ $# -eq 0 ]; then - targets=rcmaster.scs.stanford.edu - dir=remote/homaModule/ -else - targets=$* - dir=homaModule/ -fi - -for t in $targets; do - echo $t - rsync -rtv --exclude-from=rsync-exclude.txt ./ ouster@$t:$dir -done diff --git a/util/Makefile b/util/Makefile index 1885016c..5d79282e 100644 --- a/util/Makefile +++ b/util/Makefile @@ -2,13 +2,13 @@ CFLAGS := -Wall -Werror -fno-strict-aliasing -O3 -I.. 
-BINS := buffer_client buffer_server cp_node dist_test get_time_trace \ - homa_prio homa_test receive_raw scratch send_raw server \ - smi test_time_trace use_memory +BINS := buffer_client buffer_server cp_node dist_test dist_to_proto \ + get_time_trace homa_prio homa_test inc_tput receive_raw scratch \ + send_raw server smi use_memory OBJS := $(patsubst %,%.o,$(BINS)) -LIB_SRCS := dist.cc homa_api.c test_utils.cc time_trace.cc +LIB_SRCS := dist.cc test_utils.cc time_trace.cc LIB_OBJS := $(patsubst %.c,%.o,$(patsubst %.cc,%.o,$(LIB_SRCS))) LIB_OBJS += homa_receiver.o @@ -39,9 +39,6 @@ homa_receiver.o: ../homa_receiver.cc ../homa_receiver.h %.o: %.c test_utils.h ../homa.h cc -c $(CFLAGS) $< -o $@ -homa_api.o: ../homa_api.c ../homa.h - cc -c $(CFLAGS) $< -o $@ - clean: rm -f $(BINS) $(OBJS) $(LIB_OBJS) diff --git a/util/README.md b/util/README.md index a976385e..d5023130 100644 --- a/util/README.md +++ b/util/README.md @@ -35,55 +35,37 @@ truncation. ### Timetracing Tools A number of programs are available for collecting, transforming, and analyzing -timetraces. Most of these programs depend on the existence of certain -records in the timetrace. As Homa evolves, the actual timetrace records -also evolve, which can break the scripts; if you discover a broken script, -either update the script to use Homa's current timetrace records, or -change Homa to output better records (try not to break other scripts when -doing this). In addition, some of these scripts depend on timetrace records -from the main Linux kernel, outside Homa; these scripts won't work unless -you have installed my kernel modifications. +timetraces. Most have --help options that provide documentation. The following +scripts are relatively general-purpose (i.e. they don't have Homa dependencies): -**ttprint.py**: extracts the most recent timetrace from the kernel and -prints it to standard output. - -**ttcore.py**: extracts records containing certain substrings and computes how -often those records occur on each core. - -**ttgrants.py**: computes *grant lag* for a timetrace: how long it takes after a -grant is issued for the granted packet to arrive. Also computes statistics on -when grants arrive, compared to when they need to arrive to transmit at full -link speed. - -**ttgrep.py**: extracts records from a timetrace that match a pattern, and recomputes -the time differences using only those records. +**ttgrep.py**: extracts records from a timetrace that match a pattern and +recomputes the time differences using only those records. **ttmerge.py**: combines two or more timetraces into a single timetrace. -**ttnicedelay.py**:: analyzes synchronized client and server traces to -detect situations where the NIC is delaying interrupts. - **ttoffset.py**: offsets all of the times in a timetrace by a given amount (usually done to line up times in one trace with times in another). -**ttpktdelay.py**: reads client and server timetraces gathered at about the same time, -and analyzes packet delays in both directions. +**ttrange.py**: extracts timetrace entries from a given time range. -**ttrpcs.py**: scans a client or server timetrace to compute the time taken for each -phase of the RPC. +**ttsum.py**: outputs statistics from a timetrace on the delay preceding each +event. Can also produce a timeline for repeated operations such as processing +a request on a server. -**ttrange.py**: extracts timetrace entries from a given time range. 
+The following scripts are Homa-specific: + +**ttprint.py**: extracts the most recent timetrace from the kernel and +prints it to standard output. -**ttsoftirq.py**: analyzes SoftIRQ wakeup times in a timetrace. +**ttsync.py**: analyzes Homa-specific information in a collection of +timetraces gathered simultaneously on different nodes, and rewrites the +traces to synchronize their clocks. -**ttsum.py**: outputs statistics from a timetrace on the delay preceding each event. -Can also produce a timeline for repeated operations such as processing a request -on a server. +**tthoma.py**: this is the primary script for analyzing Homa data. It +contains multiple analyzers that extract different kinds of data from a +collection of timetraces. Invoke with --help for full documentation. -**ttsync.py**: reads client and server timetraces gathered at about the same time, -computes the clock offset between client and server, and outputs a new server -trace with its clock values offset to match the client clock. +### Other Useful Tools -**ttxmit.py**: analyzes packet transmissions from a timetrace to identify -uplink bubbles (gaps during which the uplink was idle even though there -were active outbound messages). +**diff_rtts.py**: compares two .rtts files collected by the cperf benchmarks +and tries to identify how/why they differ. \ No newline at end of file diff --git a/util/avg.py b/util/avg.py new file mode 100755 index 00000000..c3126986 --- /dev/null +++ b/util/avg.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +""" +Reads lines and extracts the first floating-point number to appear on +each line; prints both the individual values and the average of them. +Usage: avg.py [file] +""" + +import re +import sys + +if len(sys.argv) == 2: + f = open(sys.argv[1]) +elif len(sys.argv) == 1: + f = sys.stdin +else: + print("Usage: %s [file]" % (sys.argv[0])) + sys.exit(1) + +values = [] + +for line in f: + match = re.search(r'(?<![0-9])([0-9]+[.][0-9]+)', line) + if match: + print('Found field %s' % (match.group(1))) + values.append(float(match.group(1))) + else: + print('Line didn\'t match: %s' % (line.rstrip())) + +if len(values): + print('Average: %.3f' % (sum(values)/len(values))) +else: + print('No lines matched') \ No newline at end of file diff --git a/util/buffer_client.c b/util/buffer_client.c index 20d809e8..6b90ca25 100644 --- a/util/buffer_client.c +++ b/util/buffer_client.c @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program used together with buffer_server.c to learn about diff --git a/util/buffer_server.c b/util/buffer_server.c index 17b1f89e..e5bd4029 100644 --- a/util/buffer_server.c +++ b/util/buffer_server.c @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program used together with buffer_client.c to learn about @@ -32,11 +21,12 @@ #include #include "homa.h" +#include "test_utils.h" int main(int argc, char** argv) { int fd, port; int optval = 1; - sockaddr_in_union bindAddress; + union sockaddr_in_union bindAddress; if (argc < 2) { printf("Usage: %s port\n", argv[0]); diff --git a/util/cp_basic b/util/cp_basic index 8b9474e0..ae41548e 100755 --- a/util/cp_basic +++ b/util/cp_basic @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2023 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark computes basic latency and throughput numbers # for Homa and TCP. diff --git a/util/cp_both b/util/cp_both new file mode 100755 index 00000000..c7dad50e --- /dev/null +++ b/util/cp_both @@ -0,0 +1,91 @@ +#!/usr/bin/python3 + +# Copyright (c) 2024 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +# This cperf benchmark runs both TCP and Homa on each client and server +# node in order to measure interference between the protocols. +# Type "cp_both --help" for documentation. 
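+# (With the default split, each protocol is assigned half of --gbps; the rates computed below are then halved again, presumably because each node generates traffic both as a client and as a server.)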
+ +from cperf import * + +parser = get_parser(description= + 'Measures slowdown when TCP and Homa are competing for resources ' + 'on the same nodes.', + usage='%(prog)s [options]', defaults={'homa_gbps': 0}) +parser.add_argument('--homa-gbps', type=float, dest='homa_gbps', + metavar='B', default=None, + help='Configure Homa to generate B Gbps of total outgoing bandwidth ' + 'on each node (clients and servers combined); the remainder of ' + '--gbps will be generated by TCP (default: split --gbps between ' + 'Homa and TCP)') +options = parser.parse_args() +init(options) + +# First, run the experiment +if not options.plot_only: + homa_options = copy.deepcopy(options) + homa_options.name = "homa_" + options.workload + homa_options.protocol = "homa" + + tcp_options = copy.deepcopy(options) + tcp_options.name = "tcp_" + options.workload + tcp_options.protocol = "tcp" + + if options.homa_gbps is None: + options.homa_gbps = options.gbps/2.0 + tcp_options.gbps = (options.gbps - options.homa_gbps)/2 + if tcp_options.gbps < 0: + tcp_options.gbps = 0 + homa_options.gbps = options.gbps/2 - tcp_options.gbps + try: + run_experiments(homa_options, tcp_options) + except Exception as e: + log(traceback.format_exc()) + log("Stopping nodes") + stop_nodes() + scan_logs() + +# Generate plots and reports +homa_exp = "homa_" + options.workload +scan_metrics(homa_exp) +tcp_exp = "tcp_" + options.workload +scan_metrics(tcp_exp) + +# Generate slowdown plot. +log("Generating slowdown plot for %s" % (options.workload)) +title = "TCP (%.1f Gbps) and Homa (%.1f Gbps) together, %s, %d %s nodes" % ( + options.gbps - options.homa_gbps, options.homa_gbps, + options.workload.capitalize(), options.num_nodes, get_node_type()) +ax = start_plot_vs_msg_length(title, 1000, homa_exp) +plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) +plot_slowdown(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) +plot_slowdown(ax, homa_exp, "p99", "Homa P99", color=homa_color) +plot_slowdown(ax, homa_exp, "p50", "Homa P50", color=homa_color2) +ax.legend(loc="upper right", prop={'size': 9}) +plt.tight_layout() +plt.savefig("%s/reports/both_%s.pdf" % (options.log_dir, options.workload)) + +# Generate latency plot. +log("Generating RTT latency plot for %s" % (options.workload)) +ax = start_plot_vs_msg_length(title, [10, 10000], homa_exp, + y_label=r'RTT (µsec)') +plot_histogram(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) +plot_histogram(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) +plot_histogram(ax, homa_exp, "p99", "Homa P99", color=homa_color) +plot_histogram(ax, homa_exp, "p50", "Homa P50", color=homa_color2) +ax.legend(loc="upper left", prop={'size': 9}) +plt.tight_layout() +plt.savefig("%s/reports/rtt_%s.pdf" % (options.log_dir, options.workload)) + +# Generate CDF of small-message RTTs.
+log("Generating short message CDF for %s" % (options.workload)) +homa_x, homa_y = get_short_cdf(homa_exp) +tcp_x, tcp_y = get_short_cdf(tcp_exp) +start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)", + "Cumulative Fraction Short Messages") +plt.plot(tcp_x, tcp_y, label="TCP", color=tcp_color) +plt.plot(homa_x, homa_y, label="Homa", color=homa_color) +plt.legend(loc="upper right", prop={'size': 9}) +plt.savefig("%s/reports/short_cdf_%s.pdf" % (options.log_dir, + options.workload)) diff --git a/util/cp_buffers b/util/cp_buffers new file mode 100755 index 00000000..6af75e88 --- /dev/null +++ b/util/cp_buffers @@ -0,0 +1,172 @@ +#!/usr/bin/python3 -u + +# Copyright (c) 2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +# This cperf benchmark varies one or more aspects of Homa's configuration, +# similar and measures Homa slowdown, but it also measures switch buffer +# utilization for each configuration, including the maximum utilization +# (if there is no limit), and the minimum buffer space that can be supported +# without significant performance degradation. +# Type "cp_config_buf --help" for documentation. + +from cperf import * +from switch import * + +# Workloads, bandwidths, and running times to use by default. +load_info = [["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 30]] + +parser = get_parser(description= + 'Measures performance as the available switch buffer space is restricted.', + usage='%(prog)s [options]', defaults={'client_max': 400}) + +options = parser.parse_args() +options.alt_slowdown = True +init(options) + +if options.workload != "": + load_info = [[options.workload, options.gbps, options.seconds]] + +# Total downlink bandwidth across all nodes, in MBytes/sec +mBps = float(get_sysctl_parameter(".net.homa.link_mbps")) * options.num_nodes/8 + +# Used to collect data from experiments; a list with one value for each +# experiment. The value is a dictionary containing various information +# about the experiment. +data = [] + +if options.protocol == "homa": + pcts = [90, 80, 75, 70, 65, 60, 55, 50, 45, 40, 35, 30] +elif options.protocol == "tcp": + pcts = [90, 80, 70, 60, 50, 40, 30, 20] + set_sysctl_parameter('.net.ipv4.tcp_congestion_control', 'cubic', + options.nodes) +elif options.protocol == "dctcp": + pcts = [50, 40, 30, 25, 20, 15, 10] + set_sysctl_parameter('.net.ipv4.tcp_congestion_control', 'dctcp', + options.nodes) +pcts = [90, 80, 70, 60, 50, 40, 30, 25, 20, 15, 10] + +switch = Switch() +if not options.plot_only: + try: + + for workload, bw, seconds in load_info: + options.workload = workload + options.seconds = seconds + options.gbps = bw/2.0 + + # Measure unloaded performance. + o = copy.deepcopy(options) + o.protocol = "homa" + o.client_ports = 1 + o.client_max = 1 + o.server_ports = 1 + o.servers = options.nodes[1:2] + o.unloaded = 500 + exp_name = "unloaded_" + workload + start_servers(o.servers, o) + run_experiment(exp_name, o.clients[0:1], o) + set_unloaded(exp_name) + + start_servers(options.servers, options) + + # Run the experiment once with no buffer restrictions to get + # a baseline slowdown and buffer utilization. 
+ switch.clear_max_buffer_usage() + switch.set_buffer_limit(13.2) + exp_name = "%s_%s_100" % (options.protocol, workload) + run_experiment(exp_name, options.clients, options) + digest = get_digest(exp_name) + base_slowdown = digest["avg_slowdown"] + base_buf = switch.get_max_buffer_usage() + log("Baseline for %s: slowdown %.1f, buffer space %.1f MB" + % (exp_name, base_slowdown, base_buf)) + data.append({ + "name": exp_name, + "workload": workload, + "pct": 100, + "buf_size": base_buf, + "slowdown": base_slowdown + }) + + # Restrict the buffer size in increments of 10% of the + # baseline usage. + for pct in pcts: + buf = base_buf*pct/100 + switch.set_buffer_limit(buf) + log("Trying %.3f MB buffer limit (%d%% of baseline)" + % (buf, pct)) + exp = "%s_%d" % (exp_name, pct) + run_experiment(exp, options.clients, options) + slowdown = get_digest(exp)["avg_slowdown"] + log("Slowdown for %s with %.3f MB buffer limit: %.1f" + % (exp, buf, slowdown)) + data.append({ + "name": exp, + "workload": workload, + "pct": pct, + "buf_size": buf, + "slowdown": slowdown + }) + if slowdown > 5*base_slowdown: + break + + except Exception as e: + log(traceback.format_exc()) + + f = open("%s/reports/%s_slowdown.data" % (options.log_dir, options.protocol), + "w") + print("# This file was created by cp_buffers at %s" + % date_time, file=f) + for name in sorted(dir(options)): + if name.startswith("_"): + continue + print("# options.%s: %*s %s" % (name, 20 - len(name), "", + getattr(options, name)), file=f) + log("\nSlowdown vs. Switch Buffer Space:") + print("\n# Slowdown as a function of switch buffer space for %s:" + % (options.protocol), file=f) + print("# exp: Name of the experiment", file=f) + print("# prot: Transport protocol", file=f) + print("# workload: w1-w5", file=f) + print("# buf_mb: Buffer space limit for experiment (MBytes)", file=f) + print("# pct: Buffer space limit as percentage of buf usage when", file=f) + print("# unrestricted", file=f) + print("# slowdown: Average slowdown across all RPCs", file=f) + log("Experiment Buffer MB Pct Slowdown") + print("%-20s prot workload buf_mb pct slowdown" % ("exp"), file=f) + for exp in data: + log("%-20s %6.1f %5d %6.1f" % ( + exp["name"], exp["buf_size"], exp["pct"], + exp["slowdown"])) + print("%-20s %6s %6s %6.1f %5d %6.1f" % ( + exp["name"], options.protocol, exp["workload"], + exp["buf_size"], exp["pct"], exp["slowdown"]), file=f) + log("") + f.close() + + log("Stopping nodes") + stop_nodes() + scan_logs() + +log("Resetting switch buffer limit") +switch.set_buffer_limit(13.2) +switch.close() + +# Generate plots and reports +for workload, bw, seconds in load_info: + set_unloaded("unloaded_" + workload) + + # Generate slowdown plot.
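+    # One curve is drawn per buffer limit, so the plot shows how tail latency degrades as the available switch buffer space shrinks.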
+ log("Generating slowdown plot for %s" % (workload)) + title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, bw) + exp_name = "%s_%s" % (options.protocol, workload) + ax = start_plot_vs_msg_length(title, 1000, exp_name, y_label=" Slowdown") + for exp in data: + plot_slowdown(ax, exp["name"], "p99", "%.1f MB P99" % exp["buf_size"]) + ax.legend(loc="upper left", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/reports/%s_%s.pdf" % + (options.log_dir, options.protocol, workload)) diff --git a/util/cp_client_threads b/util/cp_client_threads index 44576a8e..15291bf3 100755 --- a/util/cp_client_threads +++ b/util/cp_client_threads @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2023 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures the throughput of a single client as a # function of the number of sending threads diff --git a/util/cp_config b/util/cp_config index 79865921..0d56ca88 100755 --- a/util/cp_config +++ b/util/cp_config @@ -1,201 +1,354 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2026 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures Homa slowdown while varying one or more # aspects of Homa's configuration (such as duty cycle). # Type "cp_config --help" for documentation. 
from cperf import * +from switch import * -load_info = [["w2", 3.2], ["w3", 14], ["w4", 20], ["w5", 20]] +load_info = [["w2", 3.2, 5], ["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 30]] parser = get_parser(description= - 'Measures Homa slowdown as the number of available priority levels ' - 'varies.', + 'Measures slowdown as the configuration is changed in various ways.', usage='%(prog)s [options]') parser.add_argument('-c', '--config', dest='config', - choices=['duty_cycle', 'fifo', 'gro', 'max_gro', 'max_gso', - 'nic_queue', 'poll', 'ports', 'prios', 'receivers', - 'rtt_bytes', 'throttle'], + choices=['balance', 'buffers', 'busy_usecs', 'client_threads', + 'dctcp_buffers', 'defer_min_bytes', 'fifo', 'gbps', + 'gen2', 'gen3', 'grant_policy', 'gro_busy_usecs', 'load', + 'max_gro', 'max_gso', 'max_nic_queue', 'mtu', 'nic_backlog', + 'poll', 'ports', 'prios', 'receivers', 'repeat', + 'tcp_buffers', 'throttle', 'time', 'unsched_bytes'], required = True, help='Aspect of configuration to change') options = parser.parse_args() init(options) -servers = range(0, options.num_nodes) -clients = range(0, options.num_nodes) if options.workload != "": - load_info = [[options.workload, options.gbps]] + load_info = [[options.workload, options.gbps, options.seconds]] +plot_max_y = 1000 specs = [] -if options.config == 'duty_cycle': - # Vary the duty_cycle configuration parameter - for duty in [100, 80, 60, 40]: - specs.append({'param': '.net.homa.duty_cycle', - 'value': duty*10, - 'exp_name': 'duty_%d' % (duty), - 'label': '%d%% duty cycle' % (duty)}) +if options.config == 'balance': + # Vary the load balancing policy + specs.append({'exp_name': 'gen2default', + 'label': 'Gen2 Default', + 'lb': 'xl170_default' + }) + specs.append({'exp_name': 'gen2', + 'label': 'Gen2', + 'lb': 'gen2' + }) + specs.append({'exp_name': 'gen3', + 'label': 'Gen3', + 'lb': 'gen3' + }) + specs.append({'exp_name': 'gen3_alt', + 'label': 'Gen3 Alt', + 'lb': 'gen3_alt' + }) +elif options.config == 'buffers': + # Vary the amount of buffer space in the switch + if not options.workload: + load_info = [["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 60]] + # Specs will be filled in below (they are workload dependent).
+ for mb in [10, 2.5, 2]: + specs.append({'exp_name': 'bufs_%.1fM' % (mb), + 'label': '%.1f MB' % (mb), + 'switch_buffer': mb}) +elif options.config == 'busy_usecs': + # Vary the time threshold for considering a core busy and trying + # to schedule work elsewhere + for usecs in [0, 10, 20, 50, 100, 200]: + specs.append({'exp_name': 'busy_%d' % (usecs), + 'label': 'busy_usecs %d' % (usecs), + 'sysctl': ['.net.homa.busy_usecs', usecs] + }) +elif options.config == 'client_threads': + # Vary the client thread configuration + for ports, receivers in [[3, 2], [3, 3], [3, 4], [2, 3], [2, 4], [1, 5]]: + name = "%d ports %d rcvrs" % (ports, receivers) + specs.append({'exp_name': "p%dr%d" % (ports, receivers), + 'label': "%d ports %d rcvrs" % (ports, receivers), + 'options': ['client_ports', ports, + 'port_receivers', receivers] + }) +elif options.config == 'dctcp_buffers': + # Use DCTCP, vary the amount of buffer space in the switch + if not options.workload: + load_info = [["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 60]] + plot_max_y = 10000 + for mb in [10, 1.2, 0.7]: + specs.append({'exp_name': 'bufs_%.1fM' % (mb), + 'label': '%.1f MB' % (mb), + 'options': ['protocol', 'dctcp'], + 'sysctl': ['.net.ipv4.tcp_congestion_control', 'dctcp'], + 'switch_buffer': mb}) +elif options.config == 'defer_min_bytes': + # Vary the defer_min_bytes configuration parameter + for value in [1000, 3000, 10000]: + specs.append({'exp_name': 'defer_%d' % (value), + 'label': 'defer_min_bytes %d' % (value), + 'sysctl': ['.net.homa.defer_min_bytes', value]}) elif options.config == 'fifo': # Vary the fraction of bandwidth reserved for the oldest message for fifo in [0, 5, 10, 20]: - specs.append({'param': '.net.homa.grant_fifo_fraction', - 'value': fifo*10, - 'param2': '.net.homa.pacer_fifo_fraction', - 'value2': fifo*10, - 'exp_name': 'fifo_%d' % (fifo), - 'label': '%d%% FIFO' % (fifo)}) + specs.append({'exp_name': 'fifo_%d' % (fifo), + 'label': '%d%% FIFO' % (fifo), + 'sysctl': ['.net.homa.grant_fifo_fraction', fifo*10, + '.net.homa.pacer_fifo_fraction', fifo*10]}) +elif options.config == 'gbps': + # Vary the assumed link speed to increase headroom for the pacer + # to avoid overcommitting the uplink.
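+    # (The values below presumably bracket the true link speed, so the pacer believes it has slightly more or slightly less bandwidth than is actually available.)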
+ for gbps in [26, 25, 24, 23]: + specs.append({'exp_name': 'gpbs_%d' % (gbps), + 'label': '%d Gbps' % (gbps), + 'sysctl': ['.net.homa.link_mbps', gbps*1000] + }) +elif options.config == 'gen2': + # Try multiple variants of the Gen2 load balancing policy + specs.append({'exp_name': 'gen2default', + 'label': 'Gen2 Default', + 'sysctl': ['.net.homa.gro_policy', 82], + 'rss': 'xl170_default' + }) + specs.append({'exp_name': 'gen2', + 'label': 'Gen2', + 'sysctl': ['.net.homa.gro_policy', 16], + 'rss': 'gen2' + }) + specs.append({'exp_name': 'gen2bypass', + 'label': 'Gen2+Bypass', + 'sysctl': ['.net.homa.gro_policy', 82], + 'rss': 'gen2' + }) +elif options.config == 'gen3': + # Try multiple variants of the Gen3 load balancing policy + specs.append({'exp_name': 'gen3', + 'label': 'Gen3', + 'sysctl': ['.net.homa.gro_policy', 128+66], + 'rss': 'gen3' + }) + specs.append({'exp_name': 'gen3nobypass', + 'label': 'Gen3-Bypass', + 'sysctl': ['.net.homa.gro_policy', 128+66], + 'rss': 'gen3' + }) + specs.append({'exp_name': 'gen3noshortbypass', + 'label': 'Gen3-ShortBypass', + 'sysctl': ['.net.homa.gro_policy', 128+64], + 'rss': 'gen3' + }) + specs.append({'exp_name': 'gen3_alt', + 'label': 'Gen3 Alt', + 'sysctl': ['.net.homa.gro_policy', 128+66], + 'rss': 'gen3_alt' + }) +elif options.config == 'grant_policy': + # Try different ways of computing grant sizes (window and max_incoming) + specs.append({'exp_name': 'default', + 'label': 'default'}) + for max_incoming in [500]: + specs.append({'exp_name': 'max_%d' % (max_incoming), + 'label': 'max_incoming %dK' % (max_incoming), + 'sysctl': ['.net.homa.window', 0, + '.net.homa.max_incoming', max_incoming*1000]}) elif options.config == 'gro': # Vary the GRO policy - for value, name in [[0, 'none'], [6, 'normal'], [4, 'idle'], [8, 'next']]: - specs.append({'param': '.net.homa.gro_policy', - 'value': value, - 'exp_name': name, - 'label': name}) + specs.append({'exp_name': 'gen2default', + 'label': 'Gen2 Default', + 'sysctl': ['.net.homa.gro_policy', 82], + 'rss': 'xl170_default' + }) +elif options.config == 'load': + # Vary the network utilization. 
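+    # (Each value is halved below, apparently splitting the offered load between the client and server halves of each node, as in cp_both.)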
+    for gbps in [30, 40, 50, 60, 70]:
+        name = "gbps%d" % (gbps)
+        specs.append({'exp_name': name,
+                'label': '%d Gbps' % (gbps),
+                'options': ['gbps', gbps/2]
+                })
 elif options.config == 'max_gro':
     # Vary the number of skbs processed at once by GRO before forwarding
     # to SoftIRQ
     for count in [5, 10, 20, 100]:
-        specs.append({'param': '.net.homa.max_gro_skbs',
-                'value': count,
-                'exp_name': 'max_gro_%d' % (count),
-                'label': 'max_gro_skbs %d' % (count)})
+        specs.append({'exp_name': 'max_gro_%d' % (count),
+                'label': 'max_gro_skbs %d' % (count),
+                'sysctl': ['.net.homa.max_gro_skbs', count]
+                })
 elif options.config == 'max_gso':
     # Vary the max_gso_size configuration parameter
     for count in [5000, 10000, 20000, 50000, 100000]:
-        specs.append({'param': '.net.homa.max_gso_size',
-                'value': count,
-                'exp_name': 'max_gso_%d' % (count),
-                'label': 'max_gso_size %d' % (count)})
-elif options.config == 'nic_queue':
+        specs.append({'exp_name': 'max_gso_%d' % (count),
+                'label': 'max_gso_size %d' % (count),
+                'sysctl': ['.net.homa.max_gso_size', count]
+                })
+elif options.config == 'max_nic_queue':
+    # Vary the limit on the length of any individual NIC queue
+    for usecs in [5, 10, 20, 40]:
+        specs.append({'exp_name': 'nicq_%d' % (usecs),
+                'label': 'max_nic_queue_usecs %d' % (usecs),
+                'sysctl': ['.net.homa.max_nic_queue_usecs', usecs]})
+elif options.config == 'mtu':
+    # Vary the maximum packet size
+    for length in [1500, 3000, 5000, 7000, 9000]:
+        specs.append({'exp_name': 'mtu_%d' % (length),
+                'label': 'MTU %d' % (length),
+                'mtu': length
+                })
+elif options.config == 'nic_backlog':
     # Vary the limit on NIC queue length
-    for micros in [1, 2, 5, 10, 10000]:
-        specs.append({'param': '.net.homa.max_nic_queue_ns',
-                'value': micros*1000,
-                'exp_name': 'nic_%d' % (micros),
-                'label': 'nic queue %d us' % (micros)})
+    for micros in [5, 10, 20, 100, 10000]:
+        specs.append({'exp_name': 'nic_%d' % (micros),
+                'label': 'nic queue %d us' % (micros),
+                'sysctl': ['.net.homa.max_nic_est_backlog_usecs', micros]
+                })
 elif options.config == 'poll':
     # Vary the polling interval
     for poll in [0, 20, 30, 40, 50]:
-        specs.append({'param': '.net.homa.poll_usecs',
-                'value': poll,
-                'exp_name': 'poll_%d' % (poll),
-                'label': 'poll %d us' % (poll)})
+        specs.append({'exp_name': 'poll_%d' % (poll),
+                'label': 'poll %d us' % (poll),
+                'sysctl': ['.net.homa.poll_usecs', poll]
+                })
 elif options.config == 'ports':
     # Vary the numbers of server and client ports
     for client, server in [[2, 2], [2, 3], [2, 4], [3, 3], [3, 2], [4, 2]]:
-        o = copy.deepcopy(options)
-        o.server_ports = server
-        o.client_ports = client
         name = "s%dc%d" % (server, client)
-        specs.append({'options': o, 'exp_name': name, 'label': name})
+        specs.append({'exp_name': name,
+                'label': name,
+                'options': ['server_ports', server,
+                        'client_ports', client]
+                })
 elif options.config == 'prios':
     # Vary the number of available priority levels
-    for priority in [1, 2, 3, 4, 8]:
-        specs.append({'param': '.net.homa.num_priorities',
-                'value': priority,
-                'exp_name': 'prios_%d' % (priority),
-                'label': '%d prios' % (priority)})
+    for priority in [1, 2, 3, 8]:
+        specs.append({'exp_name': 'prios_%d' % (priority),
+                'label': '%d prios' % (priority),
+                'sysctl': ['.net.homa.num_priorities', priority]})
 elif options.config == 'receivers':
-    # Vary the client thread configuration
-    for ports, receivers in [[3, 2], [3, 3], [3, 4], [2, 3], [2, 4], [1, 5]]:
-        o = copy.deepcopy(options)
-        o.client_ports = ports
-        o.port_receivers = receivers
-        name = "%d ports %d rcvrs" % (ports, receivers)
-        specs.append({'options': o, 'exp_name': "p%dr%d" % (ports, receivers),
-                'label': "%d ports %d rcvrs" % (ports, receivers)})
-elif options.config == 'rtt_bytes':
-    # Vary rtt_bytes
-    for rtt in [40000, 60000, 70000, 100000]:
-        specs.append({'param': '.net.homa.rtt_bytes',
-                'value': rtt,
-                'exp_name': 'rttb_%d' % (rtt),
-                'label': 'rtt_bytes %d' % (rtt)})
+    # Vary the number of receiving threads per port on both client and server
+    for threads in [2, 3, 4, 5]:
+        specs.append({'exp_name': 'rcvrs_%d' % (threads),
+                'label': '%d rcv threads' % (threads),
+                'options': ['port_receivers', threads,
+                        'port_threads', threads]
+                })
+elif options.config == 'repeat':
+    # Run the same test multiple times to check repeatability; the
+    # configuration must be set up by hand before running the experiment.
+    for i in range(1, 6):
+        specs.append({'exp_name': 'run%d' % (i),
+                'label': 'Run %d' % (i)
+                })
+elif options.config == 'tcp_buffers':
+    # Use TCP, vary the amount of buffer space in the switch
+    if not options.workload:
+        load_info = [["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 60]]
+        plot_max_y = 10000
+    for mb in [13.2, 8, 3]:
+        specs.append({'exp_name': 'bufs_%.1fM' % (mb),
+                'label': '%.1f MB' % (mb),
+                'options': ['protocol', 'tcp'],
+                'sysctl': ['.net.ipv4.tcp_congestion_control', 'cubic'],
+                'switch_buffer': mb})
 elif options.config == 'throttle':
     # Vary the cutoff for short messages that bypass the throttle mechanism
-    for cutoff in [100, 200, 500, 1000, 2000]:
-        specs.append({'param': '.net.homa.throttle_min_bytes',
-                'value': cutoff,
-                'exp_name': 'throttle_%d' % (cutoff),
-                'label': 'throttle_min_bytes %d' % (cutoff)})
+    for cutoff in [100, 200, 500, 1000]:
+        specs.append({'exp_name': 'throttle_%d' % (cutoff),
+                'label': 'throttle_min_bytes %d' % (cutoff),
+                'sysctl': ['.net.homa.throttle_min_bytes', cutoff]
+                })
+elif options.config == 'time':
+    # Vary the experiment running time
+    for seconds in [10, 20, 30, 50, 100]:
+        specs.append({'exp_name': "secs%d" % (seconds),
+                'label': "%d seconds" % (seconds),
+                'options': ['seconds', seconds]
+                })
+elif options.config == 'unsched_bytes':
+    # Vary unsched_bytes
+    for unsched in [40, 60, 80, 100, 120]:
+        specs.append({'exp_name': 'unsched_%dk' % (unsched),
+                'label': 'unsched_bytes %dk' % (unsched),
+                'sysctl': ['.net.homa.unsched_bytes', unsched*1000]
+                })
 
 # Keys are parameter names, values are old values to restore.
 old_values = {}
+switch = None
 if not options.plot_only:
     try:
-        # Start servers and measure unloaded latencies.
-        start_servers(servers, options)
-
         # For each workload, run a set of experiments with different
         # configurations.
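         # In addition to exp_name and label, the loop below applies these
         # optional fields from each spec: 'sysctl' (flat list of parameter
         # name/value pairs to set on every node), 'options' (flat list of
         # option name/value pairs to apply to the experiment options),
         # 'switch_buffer' (switch buffer limit in MB), 'mtu' (maximum
         # packet size), and 'lb' (load-balancing configuration).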
-        for workload, bw in load_info:
-            o = copy.deepcopy(options)
-            o.workload = workload
-            o.client_ports = 1
-            o.client_max = 1
-            o.server_ports = 1
-            o.server_nodes = 1
-            o.first_server = 1
-            o.unloaded = 500
-            run_experiment("unloaded_" + workload, range(0, 1), o)
+        for workload, bw, seconds in load_info:
+            if options.config == 'buffers':
+                if workload == "w5":
+                    mbs = [10, 2.5, 2]
+                else:
+                    mbs = [10, 2.0, 1]
+                specs = []
+                for mb in mbs:
+                    specs.append({'exp_name': 'bufs_%.1fM' % (mb),
+                            'label': '%.1f MB' % (mb),
+                            'switch_buffer': mb})
 
             for spec in specs:
-                o = options
-                exp_name = "%s_%s" % (spec['exp_name'], workload)
-                if 'param' in spec:
-                    name = spec['param']
-                    value = spec['value']
-                    if name not in old_values:
-                        old_values[name] = get_sysctl_parameter(name)
-                    log("Setting %s = %s" % (name, value))
-                    set_sysctl_parameter(name, value,
-                            range(0, options.num_nodes))
-                if 'param2' in spec:
-                    name = spec['param2']
-                    value = spec['value2']
-                    if name not in old_values:
-                        old_values[name] = get_sysctl_parameter(name)
-                    log("Setting %s = %s" % (name, value))
-                    set_sysctl_parameter(name, value,
-                            range(0, options.num_nodes))
-                if 'options' in spec:
-                    o = spec['options']
+                o = copy.deepcopy(options)
                 o.workload = workload
                 o.gbps = bw/2.0
-                start_servers(servers, o)
-                run_experiment(exp_name, clients, o)
+                o.seconds = seconds
+                exp_name = "%s_%s" % (spec['exp_name'], workload)
+                if 'sysctl' in spec:
+                    for i in range(0, len(spec['sysctl']), 2):
+                        name = spec['sysctl'][i]
+                        value = spec['sysctl'][i+1]
+                        if name not in old_values:
+                            old_values[name] = get_sysctl_parameter(name,
+                                    options.nodes[0])
+                        log("Setting %s = %s" % (name, value))
+                        set_sysctl_parameter(name, value, options.nodes)
+                if 'options' in spec:
+                    for i in range(0, len(spec['options']), 2):
+                        name = spec['options'][i]
+                        value = spec['options'][i+1]
+                        setattr(o, name, value)
+                if 'switch_buffer' in spec:
+                    if not switch:
+                        switch = Switch()
+                    mb = spec['switch_buffer']
+                    log("Setting buffer limit to %.1f MB" % (mb))
+                    switch.set_buffer_limit(mb)
+                if 'mtu' in spec:
+                    do_ssh(["config", "mtu", str(spec['mtu'])], options.nodes)
+                if 'lb' in spec:
+                    do_ssh(["config", "lb", spec['lb']], options.nodes)
+                start_servers(exp_name, o.servers, o)
+                run_experiment(exp_name, o.clients, o)
     except Exception as e:
         log(traceback.format_exc())
 
     for name, value in old_values.items():
         print("Restoring %s to %s" % (name, value))
-        set_sysctl_parameter(name, value, range(0, options.num_nodes))
+        set_sysctl_parameter(name, value, options.nodes)
     log("Stopping nodes")
     stop_nodes()
     scan_logs()
+if switch:
+    log("Resetting buffer limit to 13.2 MB")
+    switch.set_buffer_limit(13.2)
+    switch.close()
+    switch = None
 
 # Generate plots and reports
-for workload, bw in load_info:
-    set_unloaded("unloaded_" + workload)
-
+for workload, bw, seconds in load_info:
     # Generate slowdown plot.
     log("Generating slowdown plot for %s" % (workload))
-    title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(),
-            options.num_nodes, bw)
-    ax = start_slowdown_plot(title, 1000, "%s_%s" % (
+    title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(),
+            options.num_nodes, get_node_type(), bw)
+    ax = start_plot_vs_msg_length(title, plot_max_y, "%s_%s" % (
             specs[0]['exp_name'], workload), y_label=" Slowdown")
     for spec in specs:
         exp_name = "%s_%s" % (spec['exp_name'], workload)
@@ -211,17 +364,33 @@ for workload, bw in load_info:
     plt.savefig("%s/reports/%s_%s.pdf" %
             (options.log_dir, options.config, workload))
 
+    # Generate latency plot.
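+    # (Plots P50 and P99 RTT versus message length, one pair of curves
+    # per spec.)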
+ log("Generating latency plot for %s" % (workload)) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, get_node_type(), bw) + ax = start_plot_vs_msg_length(title, [10, 10000], "%s_%s" % ( + specs[0]['exp_name'], workload), y_label=r'RTT (µsec)') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_histogram(ax, exp_name, "p99", spec['label'] + ' P99') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_histogram(ax, exp_name, "p50", spec['label'] + ' P50') + ax.legend(loc="upper left", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/reports/%s_%s_rtt.pdf" % + (options.log_dir, options.config, workload)) + # Generate CDF of small-message RTTs. log("Generating short message CDFs for %s" % (workload)) - title = "%s %d nodes" % (workload.capitalize(), options.num_nodes) + title = "%s, %d %s nodes" % (workload.capitalize(), options.num_nodes, + get_node_type()) start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)", "Cumulative Fraction of Short Messages") for spec in specs: exp_name = "%s_%s" % (spec['exp_name'], workload) x, y = get_short_cdf(exp_name) plt.plot(x, y, label=spec['label']) - x, y = get_short_cdf("unloaded_" + workload) - plt.plot(x, y, label="Homa best case") plt.legend(loc="upper right", prop={'size': 9}) plt.savefig("%s/reports/%s_%s_cdfs.pdf" % diff --git a/util/cp_config_buf b/util/cp_config_buf new file mode 100755 index 00000000..7ca16dbe --- /dev/null +++ b/util/cp_config_buf @@ -0,0 +1,371 @@ +#!/usr/bin/python3 -u + +# Copyright (c) 2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +# This cperf benchmark varies one or more aspects of Homa's configuration, +# similar and measures Homa slowdown, but it also measures switch buffer +# utilization for each configuration, including the maximum utilization +# (if there is no limit), and the minimum buffer space that can be supported +# without significant performance degradation. +# Type "cp_config_buf --help" for documentation. + + +from cperf import * +from switch import * + +# Workloads, bandwidths, and running times to use by default. 
+load_info = [["w2", 3.2, 5], ["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 30]] + +parser = get_parser(description= + 'Measures Homa slowdown and buffer utilization in TOR egress ports ' + "while varying Homa's configuration.", + usage='%(prog)s [options]') +parser.add_argument('-c', '--config', dest='config', + choices=['dctcp_link_util', + 'dctcp_nodes', + 'dctcp_threshold', + 'dynamic_windows', + 'link_util', + 'nodes', + 'overcommit', + 'tcp_link_util', + 'tcp_nodes', + 'unsched_bytes'], + required = True, + help='Aspect of configuration to vary') +parser.add_argument('--report', dest='report', + metavar='F', default='buf_usage.data', + help='Name of the file to use for the buffer usage report in the log ' + '"reports" directory (default: buf_usage.data)') +options = parser.parse_args() +init(options) + +if options.workload != "": + load_info = [[options.workload, options.gbps, options.seconds]] + +plot_max_y = 1000 +specs = [] +if options.config == 'dctcp_link_util': + # Measure DCTCP while varying link utilization + options.protocol = "dctcp" + for gbps in [5, 10, 12, 14, 16, 18, 20]: + specs.append({'exp_name': 'dctcp_util_%d' % (gbps), + 'label': 'DCTCP link utilization %d Gbps' % (gbps), + 'options': ['gbps', gbps/2.0], + 'sysctl': ['.net.ipv4.tcp_congestion_control', 'dctcp'], + 'value': gbps + }) +elif options.config == 'dctcp_nodes': + # Measure DCTCP while varying the number of nodes + if not options.workload: + load_info = [["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 30]] + options.protocol = "dctcp" + for nodes in [5, 10, 15, 20, 25, 30]: + if nodes > options.num_nodes: + print("Can't run experiment with %d nodes: --nodes is only %d" + % (nodes, options.num_nodes)) + continue + specs.append({'exp_name': 'dctcp_nodes_%d' % (nodes), + 'label': '%d nodes' % (nodes), + 'options': ['num_nodes', nodes], + 'sysctl': ['.net.ipv4.tcp_congestion_control', 'dctcp'], + 'value': nodes + }) +elif options.config == 'dctcp_threshold': + # Measure DCTCP while varying the ECN marking threshold + options.protocol = "dctcp" + for thresh in [50, 78, 100, 150, 200, 500, 1000, 2000]: + specs.append({'exp_name': 'dctcp_threshold_%dk' % (thresh), + 'label': 'DCTCP ECN threshold %d KB' % (thresh), + 'sysctl': ['.net.ipv4.tcp_congestion_control', 'dctcp'], + 'ecn_threshold': thresh, + 'value': thresh + }) +elif options.config == 'dynamic_windows': + # Use Homa's new dynamic windows and vary max_incoming + for max in [200, 300, 400, 500, 1000]: + specs.append({'exp_name': 'dwin_%dk' % (max), + 'label': 'max_incoming %d KB' % (max), + 'sysctl': ['.net.homa.dynamic_windows', 1, + '.net.homa.max_incoming', max*1000], + 'value': max + }) +elif options.config == 'link_util': + # Vary link utilization + for gbps in [5, 10, 12, 14, 16, 18, 20]: + specs.append({'exp_name': 'util_%d' % (gbps), + 'label': 'Link utilization %d Gbps' % (gbps), + 'options': ['gbps', gbps/2.0], + 'value': gbps + }) +elif options.config == 'nodes': + # Vary the number of nodes + if not options.workload: + load_info = [["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 30]] + for nodes in [5, 10, 15, 20, 25, 30]: + if nodes > options.num_nodes: + print("Can't run experiment with %d nodes: --nodes is only %d" + % (nodes, options.num_nodes)) + break + specs.append({'exp_name': 'nodes_%d' % (nodes), + 'label': '%d nodes' % (nodes), + 'options': ['num_nodes', nodes], + 'value': nodes + }) +elif options.config == 'overcommit': + # Vary unsched_bytes + if not options.workload: + load_info = [["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 30]] + for over 
+        specs.append({'exp_name': 'over_%d' % (over),
+                'label': 'Overcommit %d' % (over),
+                'sysctl': ['.net.homa.max_overcommit', over],
+                'value': over
+                })
+elif options.config == 'tcp_link_util':
+    # Measure TCP while varying link utilization
+    options.protocol = "tcp"
+    for gbps in [5, 10, 12, 14, 16, 18, 20]:
+        specs.append({'exp_name': 'tcp_util_%d' % (gbps),
+                'label': 'TCP link utilization %d Gbps' % (gbps),
+                'options': ['gbps', gbps/2.0],
+                'sysctl': ['.net.ipv4.tcp_congestion_control', 'cubic'],
+                'value': gbps
+                })
+elif options.config == 'tcp_nodes':
+    # Measure TCP while varying the number of nodes
+    if not options.workload:
+        load_info = [["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 30]]
+    options.protocol = "tcp"
+    for nodes in [5, 10, 15, 20, 25, 30]:
+        if nodes > options.num_nodes:
+            print("Can't run experiment with %d nodes: --nodes is only %d"
+                    % (nodes, options.num_nodes))
+            continue
+        specs.append({'exp_name': 'tcp_nodes_%d' % (nodes),
+                'label': '%d nodes' % (nodes),
+                'options': ['num_nodes', nodes],
+                'sysctl': ['.net.ipv4.tcp_congestion_control', 'cubic'],
+                'value': nodes
+                })
+elif options.config == 'unsched_bytes':
+    # Vary unsched_bytes
+    for unsched in [40, 60, 80, 100, 120]:
+        specs.append({'exp_name': 'unsched_%dk' % (unsched),
+                'label': 'unsched_bytes %dk' % (unsched),
+                'sysctl': ['.net.homa.unsched_bytes', unsched*1000],
+                'value': unsched
+                })
+
+# A list with one value per entry in specs. Each value is a dictionary
+# with several fields describing various results for that spec.
+buf_limits = []
+
+# Keys are parameter names, values are old values to restore.
+old_values = {}
+if not options.plot_only:
+    try:
+        switch = Switch()
+
+        # For each workload, run a set of experiments with different
+        # configurations.
+        for workload, bw, seconds in load_info:
+            o = copy.deepcopy(options)
+            o.protocol = "homa"
+            o.workload = workload
+            o.gbps = bw/2.0
+            o.seconds = seconds
+            o.client_ports = 1
+            o.client_max = 1
+            o.server_ports = 1
+            o.servers = options.nodes[1:2]
+            o.unloaded = 500
+            exp_name = "unloaded_" + workload
+            start_servers(o.servers, o)
+            run_experiment(exp_name, o.clients[0:1], o)
+            set_unloaded(exp_name)
+
+            for spec in specs:
+                o = copy.deepcopy(options)
+                o.workload = workload
+                o.gbps = bw/2.0
+                o.seconds = seconds
+                exp_name = "%s_%s" % (spec['exp_name'], workload)
+                if 'sysctl' in spec:
+                    for i in range(0, len(spec['sysctl']), 2):
+                        name = spec['sysctl'][i]
+                        value = spec['sysctl'][i+1]
+                        if name not in old_values:
+                            old_values[name] = get_sysctl_parameter(name)
+                        log("Setting %s = %s" % (name, value))
+                        set_sysctl_parameter(name, value,
+                                range(0, options.num_nodes))
+                if 'options' in spec:
+                    for i in range(0, len(spec['options']), 2):
+                        name = spec['options'][i]
+                        value = spec['options'][i+1]
+                        setattr(o, name, value)
+                if 'ecn_threshold' in spec:
+                    log("Setting switch ECN marking thresholds to %d"
+                            % (spec['ecn_threshold']))
+                    switch.set_all_ecn_thresholds(spec['ecn_threshold'])
+                o.servers = options.nodes[0:o.num_nodes]
+                o.clients = options.nodes[0:o.num_nodes]
+
+                # Total downlink bandwidth across all nodes, in MBytes/sec
+                mBps = float(get_sysctl_parameter(".net.homa.link_mbps")) \
+                        * o.num_nodes/8
+                start_servers(o.servers, o)
+
+                # Run the experiment once with no buffer restrictions to get
+                # a baseline slowdown and buffer utilization.
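+                # (13.2 MB appears to be the switch's full shared-buffer
+                # capacity; it is used throughout this script as the
+                # "unrestricted" limit.)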
+ switch.clear_max_buffer_usage() + switch.set_buffer_limit(13.2) + run_experiment(exp_name, o.clients, o) + digest = get_digest(exp_name) + base_slowdown = digest["avg_slowdown"] + base_buf = switch.get_max_buffer_usage() + log("Baseline for %s: slowdown %.1f, buffer space %.1f MB" + % (exp_name, base_slowdown, base_buf)) + if base_buf < 1.0: + log("Baseline buffer space for %s is only %.3f MB; " + "no need to find minimum." % (exp_name, base_buf)) + buf_limits.append({ + "name": exp_name, + "workload": workload, + "value": spec["value"], + "slowdown": base_slowdown, + "base_buf": base_buf, + "lower_limit": 0, + "base_usecs": base_buf * 1e06 / mBps, + "min_usecs": 0 + }) + continue + + # Now restrict the buffer size, using a series of experiments + # with binary search to find the buffer size at which average + # slowdown increases by 10% from the baseline + max_buf = base_buf + min_buf = 0.0 + count = 0 + while ((max_buf - min_buf) > .1*max_buf) and (max_buf > 0.1): + buf = (max_buf + min_buf)/2.0 + switch.set_buffer_limit(buf) + log("Trying %.3f MB buffer limit (max %.3f MB, min %.3f MB)" + % (buf, max_buf, min_buf)) + count += 1 + exp = "%s_%d" % (exp_name, count) + run_experiment(exp, o.clients, o) + slowdown = get_digest(exp)["avg_slowdown"] + log("Slowdown for %s with %.3f MB buffer limit: %.1f" + % (exp, buf, slowdown)) + if (slowdown < base_slowdown): + slowdown = base_slowdown + if slowdown > 1.1*base_slowdown: + min_buf = buf + else: + max_buf = buf + lower_limit = (max_buf + min_buf)/2.0 + log("Minimum buffer space for %s: %.3f MB" % (exp_name, + lower_limit)) + buf_limits.append({ + "name": exp_name, + "workload": workload, + "value": spec["value"], + "slowdown": base_slowdown, + "base_buf": base_buf, + "lower_limit": lower_limit, + "base_usecs": base_buf * 1e06 / mBps, + "min_usecs": lower_limit * 1e06/ mBps + }) + log("Resetting switch buffer limit") + switch.set_buffer_limit(13.2) + if options.config == 'dctcp_threshold': + log("Resetting switch ECN marking thresholds") + switch.set_all_ecn_thresholds(78) + switch.close() + + except Exception as e: + log(traceback.format_exc()) + + f = open("%s/reports/%s" % (options.log_dir, options.report), "w") + print("# Buffer usage statistics gathered by cp_config_buf at %s" + % date_time, file=f) + for name in sorted(dir(options)): + if name.startswith("_"): + continue + print("# options.%s: %*s %s" % (name, 20 - len(name), "", + getattr(options, name)), file=f) + log("\nSwitch Buffer Utilization:") + print("", file=f) + print("# exp: Name of the experiment", file=f) + print("# prot: Transport protocol", file=f) + print("# workload: w1-w5", file=f) + print("# value: Value of configuration option that is varied", file=f) + print("# slowdown: Average slowdown across all RPCs when no restrictions", file=f) + print("# base_mb: Buffer space used when no restrictions (MBytes)", file=f) + print("# min_mb: Buffer space when slowdown degrades by 10% (MBytes)", file=f) + print("# base_us: Buffer space used when no restriction (microseconds)", file=f) + print("# min_us: Buffer space when slowdown degrades by 10% (microseconds)", file=f) + log("Experiment Slowdown Max MB Min MB Max us Min us") + print("%-24s prot workload value slowdown base_mb min_mb " + "base_us min_us" % ("exp"), file = f) + for exp in buf_limits: + log("%-24s %6.1f %6.2f %6.2f %6.1f %6.1f" % ( + exp["name"], exp["slowdown"], exp["base_buf"], + exp["lower_limit"], exp["base_usecs"], exp["min_usecs"])) + print("%-24s %6s %6s %6s %6.1f %6.2f %6.2f %6.1f %6.1f" % ( + 
exp["name"], options.protocol, exp["workload"], exp["value"], + exp["slowdown"], exp["base_buf"], exp["lower_limit"], + exp["base_usecs"], exp["min_usecs"]), + file=f) + log("") + f.close() + + for name, value in old_values.items(): + log("Restoring %s to %s" % (name, value)) + set_sysctl_parameter(name, value, range(0, options.num_nodes)) + log("Stopping nodes") + stop_nodes() + scan_logs() + +# Generate plots and reports +for workload, bw, seconds in load_info: + set_unloaded("unloaded_" + workload) + + # Generate slowdown plot. + log("Generating slowdown plot for %s" % (workload)) + title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, bw) + ax = start_plot_vs_msg_length(title, plot_max_y, "%s_%s" % ( + specs[0]['exp_name'], workload), y_label=" Slowdown") + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_slowdown(ax, exp_name, "p99", spec['label'] + ' P99') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_slowdown(ax, exp_name, "p50", spec['label'] + ' P50') + if workload == "w5": + ax.legend(loc="upper right", prop={'size': 9}) + else: + ax.legend(loc="upper left", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/reports/%s_%s.pdf" % + (options.log_dir, options.config, workload)) + + # Generate CDF of small-message RTTs. + log("Generating short message CDFs for %s" % (workload)) + title = "%s %d nodes" % (workload.capitalize(), options.num_nodes) + start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)", + "Cumulative Fraction of Short Messages") + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + x, y = get_short_cdf(exp_name) + plt.plot(x, y, label=spec['label']) + x, y = get_short_cdf("unloaded_" + workload) + plt.plot(x, y, label="Homa best case") + + plt.legend(loc="upper right", prop={'size': 9}) + plt.savefig("%s/reports/%s_%s_cdfs.pdf" % + (options.log_dir, options.config, workload)) diff --git a/util/cp_load b/util/cp_load index 597e53b4..1517be71 100755 --- a/util/cp_load +++ b/util/cp_load @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark generates CDFs of short-message latency for Homa # and TCP under different loads. 
@@ -100,7 +89,7 @@ set_unloaded(unloaded_exp) log("Generating slowdown plot for %s" % (options.workload)) title = "%s %d nodes, %.2f GB/s" % (options.workload.capitalize(), options.num_nodes, options.gbps) -ax = start_slowdown_plot(title, 1000, "homa1.0_%s" % (options.workload)) +ax = start_plot_vs_msg_length(title, 1000, "homa1.0_%s" % (options.workload)) if options.dctcp: plot_slowdown(ax, "dctcp%3.1f_%s" % (fractions[0], options.workload), "p99", "DCTCP %4.2f Gbps" % (fractions[0] * options.gbps), diff --git a/util/cp_mtu b/util/cp_mtu index 9b825680..e523c646 100755 --- a/util/cp_mtu +++ b/util/cp_mtu @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark generates CDFs of short-message latency for Homa # and TCP under different values for MTU (maximum packet size). diff --git a/util/cp_node.cc b/util/cp_node.cc index 29ca8d0d..f92ae47f 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+/* Copyright (c) 2019-2023 Homa Developers
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
 */
 
 /* This file contains a program that runs on one node, as part of
@@ -53,7 +42,9 @@
 #include
 
 #include "dist.h"
+extern "C" {
 #include "homa.h"
+}
 #include "homa_receiver.h"
 #include "test_utils.h"
 #include "time_trace.h"
@@ -65,18 +56,16 @@ using std::string;
 uint32_t client_max = 1;
 uint32_t client_port_max = 1;
 int client_ports = 0;
-int first_port = 4000;
-int first_server = 1;
+int first_port = -1;
 bool is_server = false;
-int id = -1;
+int node_id = -1;
 double net_gbps = 0.0;
 bool tcp_trunc = true;
 bool one_way = false;
 int port_receivers = 1;
 int port_threads = 1;
 std::string protocol_string;
-const char *protocol;
-int server_nodes = 1;
+const char *protocol = "homa";
 int server_ports = 1;
 bool verbose = false;
 std::string workload_string;
@@ -86,6 +75,10 @@ bool client_iovec = false;
 bool server_iovec = false;
 int inet_family = AF_INET;
 int server_core = -1;
+int buf_bpages = 1000;
+
+/* Node ids for client to send requests to. */
+std::vector<int> server_ids;
 
 /** @rand_gen: random number generator. */
 static std::mt19937 rand_gen(
@@ -131,96 +124,82 @@ struct conn_id {
        }
 };
 
-/**
- * @server_addrs: Internet addresses for each of the server threads available
- * to receive a Homa RPC.
- */
-std::vector server_addrs;
-
-/**
- * @server_ids: for each entry in @server_addrs, a connection identifier
- * with all fields filled in except client_port, which will be 0.
- */
-std::vector server_ids;
-
-/**
- * @freeze: one entry for each node index; 1 means messages to that
- * node should contain a flag telling the node to freeze its time trace.
- */
-std::vector freeze;
+/** @message_id: used to generate unique identifiers for outgoing messages.*/
+std::atomic message_id;
 
 /**
- * @first_id: entry i contains the index in server_addrs of the first
- * entry for the server ports on node i. Used to map from node+port to
- * server id.
+ * @experiments: names of all known experiments (may include some that are
+ * no longer in use)
 */
-std::vector first_id;
-
-/** @message_id: used to generate unique identifiers for outgoing messages.*/
-std::atomic message_id;
+std::vector<std::string> experiments;
 
 /**
 * @last_stats_time: time (in rdtsc cycles) when we last printed
- * staticsics. Zero means that none of the statistics below are valid.
+ * statistics. Zero means that none of the statistics below are valid.
 */
 uint64_t last_stats_time = 0;
 
 /**
- * @last_client_rpcs: total number of client RPCS completed by this
- * application as of the last time we printed statistics.
+ * @last_client_rpcs: entries correspond to @experiments; total number of
+ * client RPCs completed by that experiment as of the last time we printed
+ * statistics.
 */
-uint64_t last_client_rpcs = 0;
+std::vector<uint64_t> last_client_rpcs;
 
 /**
- * @last_client_bytes_out: total amount of data in request messages for
- * client RPCS completed by this application as of the last time we printed
- * statistics.
+ * @last_client_bytes_out: entries correspond to @experiments; total amount
+ * of data sent in request messages by client RPCs in that experiment as
+ * of the last time we printed statistics.
 */
-uint64_t last_client_bytes_out = 0;
+std::vector<uint64_t> last_client_bytes_out;
 
 /**
- * @last_client_bytes_in: total amount of data in response messages for
- * client RPCS completed by this application as of the last time we printed
- * statistics.
+ * @last_client_bytes_in: entries correspond to @experiments; total
+ * amount of data received in response messages for client RPCs in that
+ * experiment as of the last time we printed statistics.
 */
-uint64_t last_client_bytes_in = 0;
+std::vector<uint64_t> last_client_bytes_in;
 
 /**
- * @last_total_elapsed: total amount of elapsed time for all client RPCs
- * issued by this application (in units of rdtsc cycles), as of the last
- * time we printed statistics.
+ * @last_total_rtt: entries correspond to @experiments; total amount of
+ * elapsed time for all client RPCs in that experiment (units of rdtsc cycles)
+ * as of the last time we printed statistics.
 */
-uint64_t last_total_rtt = 0;
+std::vector<uint64_t> last_total_rtt;
 
 /**
- * @last_lag: total lag across all clients (measured in rdtsc cycles)
- * as of the last time we printed statistics.
+ * @last_lag: entries correspond to @experiments; total lag (measured in rdtsc
+ * cycles) for all clients in that experiment, as of the last time we printed
+ * statistics.
 */
-uint64_t last_lag = 0;
+std::vector<uint64_t> last_lag;
 
 /**
- * @last_backups: total # of backed-up sends as of the last time we
+ * @last_backups: entries correspond to @experiments; total # of backed-up
+ * sends for client RPCs issued by that experiment as of the last time we
 * printed statistics.
 */
-uint64_t last_backups = 0;
+std::vector<uint64_t> last_backups;
 
 /**
- * @last_server_rpcs: total number of server RPCS handled by this
- * application as of the last time we printed statistics.
+ * @last_server_rpcs: entries correspond to @experiments; total # of server
+ * RPCs handled by that experiment as of the last time we printed statistics.
 */
-uint64_t last_server_rpcs = 0;
+std::vector<uint64_t> last_server_rpcs;
 
 /**
- * @last_server_bytes_in: total amount of data in incoming requests handled by
- * this application as of the last time we printed statistics.
+ * @last_server_bytes_in: entries correspond to @experiments; total amount
+ * of data in incoming requests handled by that experiment as of the last
+ * time we printed statistics.
 */
-uint64_t last_server_bytes_in = 0;
+std::vector<uint64_t> last_server_bytes_in;
 
 /**
- * @last_server_bytes_out: total amount of data in responses sent by
- * this application as of the last time we printed statistics.
+ * @last_server_bytes_out: entries correspond to @experiments; total amount
+ * of data in responses sent by that experiment as of the last time we printed
+ * statistics.
 */
-uint64_t last_server_bytes_out = 0;
+std::vector<uint64_t> last_server_bytes_out;
 
 /**
 * @last_per_server_rpcs: server->requests for each individual server,
@@ -266,6 +245,16 @@ int kfreeze_count = 0;
 */
 int64_t debug[5];
 
+/**
+ * fatal() - Invoked when fatal errors occur: flushes buffered output, then
+ * exits the application immediately (_exit skips static destructors and
+ * atexit handlers).
+ */
+void fatal()
+{
+        fflush(stdout);
+        fflush(stderr);
+        _exit(1);
+}
+
 /**
 * print_help() - Print out usage information for this program.
 * @name: Name of the program (argv[0])
@@ -279,66 +268,90 @@ void print_help(const char *name)
        "lines from standard input and executing them as commands. The following\n"
The following\n" "commands are supported, each followed by a list of options supported\n" "by that command:\n\n" - "client [options] Start one or more client threads\n" - " --client-max Maximum number of outstanding requests from a single\n" + "client [options] Start one or more client threads\n"); + printf(" --buf-bpages Number of bpages to allocate in the buffer poool for\n" + " incoming messages (default: %d)\n", + buf_bpages); + printf(" --client-max Maximum number of outstanding requests from a single\n" " client machine (divided equally among client ports)\n" - " (default: %d)\n" - " --first-port Lowest port number to use for each server (default: %d)\n" - " --first-server Id of first server node (default: %d, meaning node%d)\n" - " --gbps Target network utilization, including only message data,\n" - " Gbps; 0 means send continuously (default: %.1f)\n" - " --id Id of this node; a value of I >= 0 means requests will\n" - " not be sent to nodeI (default: -1)\n" - " --iovec Use homa_sendv instead of homa_send\n" - " --ipv6 Use IPv6 instead of IPv4\n" - " --no-trunc For TCP, allow messages longer than Homa's limit\n" - " --one-way Make all response messages 100 B, instead of the same\n"\ - " size as request messages\n" - " --ports Number of ports on which to send requests (one\n" - " sending thread per port (default: %d)\n" - " --port-receivers Number of threads to listen for responses on each\n" + " (default: %d)\n", client_max); + printf(" --exp Name of the experiment in which these client threads\n"); + printf(" will be participating; used to label measurement data\n"); + printf(" (defaults to _)\n"); + printf(" --first-port Lowest port number to use for each server (default: \n"); + printf(" 4000 for Homa, 5000 for TCP)\n"); + printf(" --first-server Id of first server node (default: 1, meaning node1)\n"); + printf(" --gbps Target network utilization, including only message data,\n" + " Gbps; 0 means send continuously (default: %.1f)\n", + net_gbps); + printf(" --id Id of this node; a value of I >= 0 means requests will\n" + " not be sent to nodeI (default: -1)\n"); + printf(" --ipv6 Use IPv6 instead of IPv4\n"); + printf(" --no-trunc For TCP, allow messages longer than Homa's limit\n"); + printf(" --one-way Make all response messages 100 B, instead of the same\n"\ + " size as request messages\n"); + printf(" --ports Number of ports on which to send requests (one\n" + " sending thread per port (default: %d)\n", + client_ports); + printf(" --port-receivers Number of threads to listen for responses on each\n" " port (default: %d). 
Zero means senders wait for their\n" - " own requests synchronously\n" - " --protocol Transport protocol to use: homa or tcp (default: %s)\n" - " --server-nodes Number of nodes running server threads (default: %d)\n" - " --server-ports Number of server ports on each server node\n" - " (default: %d)\n" - " --unloaded Nonzero means run test in special mode for collecting\n" + " own requests synchronously\n", + port_receivers); + printf(" --protocol Transport protocol to use: homa or tcp (default: %s)\n", + protocol); + printf(" --server-nodes Number of nodes running server threads (default: 1)\n"); + printf(" --server-ports Number of server ports on each server node\n" + " (default: %d)\n", + server_ports); + printf(" --servers Comma-separated list of integer ids to use as server\n"); + printf(" nodes; if specified, overrides --first-server and\n" + " --server-nodes\n"); + printf(" --unloaded Nonzero means run test in special mode for collecting\n" " baseline data, with the given number of measurements\n" - " per length in the distribution (Homa only, default: 0)\n" - " --workload Name of distribution for request lengths (e.g., 'w1')\n" - " or integer for fixed length (default: %s)\n\n" "debug value value ... Set one or more int64_t values that may be used for\n" - " various debugging purposes\n\n" - "dump_times file Log RTT times (and lengths) to file\n\n" - "exit Exit the application\n\n" - "log [options] [msg] Configure logging as determined by the options. If\n" + " per length in the distribution (Homa only, default: 0)\n"); + printf(" --workload Name of distribution for request lengths (e.g., 'w1')\n" + " or integer for fixed length (default: %s)\n\n", + workload); + printf("debug value value ... Set one or more int64_t values that may be used for\n" + " various debugging purposes\n\n"); + printf("dump_times file [exp] Log RTT times (and lengths) for clients running\n"); + printf(" experiment exp to file; if exp is omitted, dump\n"); + printf(" all RTTs\n\n"); + printf("exit Exit the application\n\n"); + printf("log [options] [msg] Configure logging as determined by the options. 
If\n" " there is an \"option\" that doesn't start with \"--\",\n" " then it and all of the remaining words are printed to\n" - " the log as a message.\n" - " --file Name of log file to use for future messages (\"-\"\n" - " means use standard output)\n" - " --level Log level: either normal or verbose\n\n" - "server [options] Start serving requests on one or more ports\n" - " --first-port Lowest port number to use (default: %d)\n" - " --iovec Use homa_replyv instead of homa_reply\n" - " --ipv6 Use IPv6 instead of IPv4\n" - " --pin All server threads will be restricted to run only\n" - " on the givevn core\n" - " --protocol Transport protocol to use: homa or tcp (default: %s)\n" - " --port-threads Number of server threads to service each port\n" - " (Homa only, default: %d)\n" - " --ports Number of ports to listen on (default: %d)\n\n" - "stop [options] Stop existing client and/or server threads; each\n" - " option must be either 'clients' or 'servers'\n\n" - " tt [options] Manage time tracing:\n" - " freeze Stop recording time trace information until\n" - " print has been invoked\n" - " kfreeze Freeze the kernel's internal timetrace\n" - " print file Dump timetrace information to file\n", - client_max, first_port, first_server, first_server, net_gbps, - client_ports, port_receivers, protocol, - server_nodes, server_ports, workload, - first_port, protocol, port_threads, server_ports); + " the log as a message.\n"); + printf(" --file Name of log file to use for future messages (\"-\"\n" + " means use standard output)\n"); + printf(" --level Log level: either normal or verbose\n\n"); + printf("server [options] Start serving requests on one or more ports\n"); + printf(" --buf-bpages Number of bpages to allocate in the buffer poool for\n" + " incoming messages (default: %d)\n", + buf_bpages); + printf(" --exp Name of the experiment in which these server ports\n"); + printf(" will be participating; used to label measurement data\n"); + printf(" (defaults to _)\n"); + printf(" --first-port Lowest port number to use (default: 4000 for Homa,\n"); + printf(" 5000 for TCP)\n"); + printf(" --iovec Use iovecs for reply instead of a single buffer\n"); + printf(" --ipv6 Use IPv6 instead of IPv4\n"); + printf(" --pin All server threads will be restricted to run only\n" + " on the givevn core\n"); + printf(" --protocol Transport protocol to use: homa or tcp (default: %s)\n", + protocol); + printf(" --port-threads Number of server threads to service each port\n" + " (default: %d)\n", + port_threads); + printf(" --ports Number of ports to listen on (default: %d)\n\n", + server_ports); + printf("stop [options] Stop existing client and/or server threads; each\n" + " option must be either 'clients' or 'servers'\n\n"); + printf(" tt [options] Manage time tracing:\n"); + printf(" freeze Stop recording time trace information until\n" + " print has been invoked\n"); + printf(" kfreeze Freeze the kernel's internal timetrace\n"); + printf(" print file Dump timetrace information to file\n"); } /** @@ -441,6 +454,7 @@ void log_affinity() */ void kfreeze() { +#ifndef __STRIP__ /* See strip.py */ kfreeze_count++; if (kfreeze_count > 1) return; @@ -454,6 +468,7 @@ void kfreeze() log(NORMAL, "ERROR: HOMAIOCFREEZE ioctl failed: %s\n", strerror(errno)); close(fd); +#endif /* See strip.py */ } /** @@ -467,7 +482,20 @@ struct message_header { * @length: total number of bytes in the message, including this * header. 
        */
-       int length:30;
+       int32_t length;
+
+       /**
+       * @cid: uniquely identifies the connection between a client
+       * and a server.
+       */
+       conn_id cid;
+
+       /**
+       * @msg_id: created by client, returned by server so client can
+       * match responses to requests; distinguishes among concurrent
+       * outstanding requests from a client. Not unique across all time.
+       */
+       uint16_t msg_id;
 
        /** @freeze: true means the recipient should freeze its time trace. */
        unsigned int freeze:1;
@@ -479,68 +507,14 @@ struct message_header {
        unsigned int short_response:1;
 
        /**
-       * @cid: uniquely identifies the connection between a client
-       * and a server.
+       * @response: nonzero means this is a response message; zero means
+       * it is a request.
        */
-       conn_id cid;
+       unsigned int response:1;
 
-       /**
-       * @msg_id: unique identifier for this message among all those
-       * from a given client machine.
-       */
-       uint32_t msg_id;
+       unsigned int reserved:13;
 };
 
-/**
- * init_server_addrs() - Set up the server_addrs table (addresses of the
- * server/port combinations that clients will communicate with), based on
- * current configuration parameters. Any previous contents of the table
- * are discarded. This also initializes related arrays @server_ids and
- * @freeze.
- */
-void init_server_addrs(void)
-{
-       server_addrs.clear();
-       server_ids.clear();
-       freeze.clear();
-       first_id.clear();
-       for (int node = first_server; node < first_server + server_nodes;
-                       node++) {
-               char host[100];
-               struct addrinfo hints;
-               struct addrinfo *matching_addresses;
-               sockaddr_in_union *dest;
-
-               if (node == id)
-                       continue;
-               snprintf(host, sizeof(host), "node%d", node);
-               memset(&hints, 0, sizeof(struct addrinfo));
-               hints.ai_family = inet_family;
-               hints.ai_socktype = SOCK_DGRAM;
-               int status = getaddrinfo(host, NULL, &hints,
-                               &matching_addresses);
-               if (status != 0) {
-                       log(NORMAL, "FATAL: couldn't look up address "
-                                       "for %s: %s\n",
-                                       host, gai_strerror(status));
-                       exit(1);
-               }
-               dest = reinterpret_cast
-                               (matching_addresses->ai_addr);
-               while (((int) first_id.size()) < node)
-                       first_id.push_back(-1);
-               first_id.push_back((int) server_addrs.size());
-               for (int thread = 0; thread < server_ports; thread++) {
-                       dest->in4.sin_port = htons(first_port + thread);
-                       server_addrs.push_back(*dest);
-                       server_ids.emplace_back(node, thread, id, 0);
-               }
-               while (((int) freeze.size()) <= node)
-                       freeze.push_back(0);
-               freeaddrinfo(matching_addresses);
-       }
-}
-
 /**
 * class spin_lock - Implements simple spin lock guards: lock is acquired by
 * constructor, released by destructor.
@@ -710,9 +684,8 @@ int tcp_connection::read(bool loop,
                        && (errno == ECONNRESET))) {
                /* Connection was closed by the client. */
                snprintf(error_message, sizeof(error_message),
-                               "TCP connection on port %d "
-                               "(fd %d) closed by peer %s",
-                               port, fd, print_address(&peer));
+                               "TCP connection on port %d (fd %d) closed by peer %s",
+                               port, fd, print_address(&peer));
                return 1;
        }
@@ -739,6 +712,19 @@ int tcp_connection::read(bool loop,
        }
 
+       if ((count >= 4) && (strncmp(buffer, "GET ", 4) == 0)) {
+               /* It looks like someone is trying to make an HTTP
+               * connection to us; that's bogus.
+               */
+               log(NORMAL, "ERROR: unexpected data received from "
+                               "%s: %.*s\n", print_address(&peer),
+                               count, buffer);
+               snprintf(error_message, sizeof(error_message),
+                               "Unexpected data received from %s",
+                               print_address(&peer));
+               return 1;
+       }
+
        /*
        * Process incoming bytes (could contain parts of multiple
        * requests). The first 4 bytes of each request give its
@@ -770,8 +756,22 @@
                }
        }
 
-       /* At this point we know the request length, so read until
-       * we've got a full request.
+       if ((header.length > HOMA_MAX_MESSAGE_LENGTH)
+                       || (header.length < 0)) {
+               log(NORMAL, "ERROR: invalid message length %d "
+                               "from %s, closing connection\n",
+                               header.length,
+                               print_address(&peer));
+               snprintf(error_message, sizeof(error_message),
+                               "Invalid message length %d "
+                               "from %s",
+                               header.length,
+                               print_address(&peer));
+               return 1;
+       }
+
+       /* At this point we know the request length, so read
+       * until we've got a full request.
        */
        int needed = header.length - bytes_received;
        if (count < needed) {
@@ -813,7 +813,7 @@ void tcp_connection::set_epoll_events(int epoll_fd, uint32_t events)
                        : EPOLL_CTL_MOD, fd, &ev) < 0) {
                log(NORMAL, "FATAL: couldn't add/modify epoll event: %s\n",
                                strerror(errno));
-               exit(1);
+               fatal();
        }
        epoll_events = events;
 }
@@ -881,7 +881,7 @@ bool tcp_connection::xmit()
                                        "to %s: %s (port %d)\n",
                                        print_address(&peer),
                                        strerror(errno), port);
-                       exit(1);
+                       fatal();
                }
        }
        if (bytes_sent < header->length) {
@@ -905,6 +905,9 @@
 */
 class server_metrics {
    public:
+       /** @experiment: Name of experiment for this server thread */
+       std::string experiment;
+
        /** @requests: Total number of requests handled so far. */
        uint64_t requests;
@@ -920,12 +923,14 @@ class server_metrics {
        */
        uint64_t bytes_out;
 
-       server_metrics() :requests(0), bytes_in(0), bytes_out(0) {}
+       server_metrics(std::string& experiment) : experiment(experiment),
+                       requests(0), bytes_in(0), bytes_out(0) {}
 };
 
 /**
 * @metrics: keeps track of metrics for all servers (whether Homa or TCP).
- * These are malloc-ed and must be freed eventually.
+ * These are malloc-ed and must be freed eventually. Each entry is a pointer
+ * so that the metrics object doesn't get destructed when its server shuts
+ * down.
 */
 std::vector<server_metrics *> metrics;
@@ -936,7 +941,8 @@ std::vector<server_metrics *> metrics;
 */
 class homa_server {
    public:
-       homa_server(int port, int id, int inet_family, int num_threads);
+       homa_server(int port, int id, int inet_family, int num_threads,
+                       std::string& experiment);
        ~homa_server();
        void server(int thread_id, server_metrics *metrics);
@@ -949,6 +955,9 @@ class homa_server {
        /** @port: Homa port number managed by this object. */
        int port;
 
+       /** @experiment: name of the experiment this server is running. */
+       string experiment;
+
        /**
        * @buf_region: mmapped region of memory in which receive buffers
        * are allocated.
@@ -974,25 +983,31 @@ std::vector<homa_server *> homa_servers;
 * @inet_family: AF_INET or AF_INET6: determines whether we use IPv4 or IPv6.
 * @num_threads: How many threads should collectively service requests on
 * @port.
+ * @experiment: Name of the experiment in which this server is participating.
*/ -homa_server::homa_server(int port, int id, int inet_family, - int num_threads) +homa_server::homa_server(int port, int id, int inet_family, int num_threads, + std::string& experiment) : id(id) , fd(-1) , port(port) + , experiment(experiment) , buf_region(NULL) , buf_size(0) , threads() { sockaddr_in_union addr; - struct homa_set_buf_args arg; + struct homa_rcvbuf_args arg; + + if (std::find(experiments.begin(), experiments.end(), experiment) + == experiments.end()) + experiments.emplace_back(experiment); fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); if (fd < 0) { log(NORMAL, "FATAL: homa_server couldn't open Homa " "socket: %s\n", strerror(errno)); - exit(1); + fatal(); } memset(&addr, 0, sizeof(addr)); @@ -1007,30 +1022,30 @@ homa_server::homa_server(int port, int id, int inet_family, log(NORMAL, "FATAL: homa_server couldn't bind socket " "to Homa port %d: %s\n", port, strerror(errno)); - exit(1); + fatal(); } log(NORMAL, "Successfully bound to Homa port %d\n", port); - buf_size = 1000*HOMA_BPAGE_SIZE; + buf_size = buf_bpages*HOMA_BPAGE_SIZE; buf_region = (char *) mmap(NULL, buf_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (buf_region == MAP_FAILED) { printf("Couldn't mmap buffer region for server on port %d: %s\n", port, strerror(errno)); - exit(1); + fatal(); } - arg.start = buf_region; + arg.start = (uintptr_t)buf_region; arg.length = buf_size; - int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, + int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); if (status < 0) { - printf("FATAL: error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("FATAL: error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); - exit(1); + fatal(); } for (int i = 0; i < num_threads; i++) { - server_metrics *thread_metrics = new server_metrics; + server_metrics *thread_metrics = new server_metrics(experiment); metrics.push_back(thread_metrics); threads.emplace_back([this, i, thread_metrics] () { server(i, thread_metrics); @@ -1043,6 +1058,7 @@ homa_server::homa_server(int port, int id, int inet_family, */ homa_server::~homa_server() { + log(NORMAL, "Homa server on port %d shutting down\n", port); shutdown(fd, SHUT_RDWR); for (std::thread &thread: threads) thread.join(); @@ -1058,11 +1074,13 @@ homa_server::~homa_server() */ void homa_server::server(int thread_id, server_metrics *metrics) { - message_header *header; - int length, num_vecs, result; - char thread_name[50]; homa::receiver receiver(fd, buf_region); struct iovec vecs[HOMA_MAX_BPAGES]; + struct homa_sendmsg_args homa_args; + int length, num_vecs, result; + message_header *header; + struct msghdr msghdr; + char thread_name[50]; int offset; snprintf(thread_name, sizeof(thread_name), "S%d.%d", id, thread_id); @@ -1075,11 +1093,15 @@ void homa_server::server(int thread_id, server_metrics *metrics) while (1) { while (1) { - length = receiver.receive(HOMA_RECVMSG_REQUEST, 0); + length = receiver.receive(0, 0); if (length >= 0) break; - if ((errno == EBADF) || (errno == ESHUTDOWN)) + if ((errno == EBADF) || (errno == ESHUTDOWN)) { + log(NORMAL, "Homa server thread %s exiting " + "(socket closed)\n", + thread_name); return; + } else if ((errno != EINTR) && (errno != EAGAIN)) log(NORMAL, "recvmsg failed: %s\n", strerror(errno)); @@ -1098,6 +1120,7 @@ void homa_server::server(int thread_id, server_metrics *metrics) if ((header->short_response) && (header->length > 100)) { header->length = 100; } + header->response = 1; num_vecs = 0; offset = 0; @@ -1110,13 +1133,16 @@ void 
homa_server::server(int thread_id, server_metrics *metrics) offset += chunk_size; num_vecs++; } - result = homa_replyv(fd, vecs, num_vecs, receiver.src_addr(), - receiver.id()); + init_sendmsg_hdrs(&msghdr, &homa_args, vecs, num_vecs, + receiver.src_addr(), + sockaddr_size(receiver.src_addr())); + homa_args.id = receiver.id(); + result = sendmsg(fd, &msghdr, 0); if (result < 0) { - log(NORMAL, "FATAL: homa_reply failed for server " + log(NORMAL, "FATAL: sendmsg failed for server " "port %d: %s\n", port, strerror(errno)); - exit(1); + fatal(); } metrics->requests++; metrics->bytes_in += length; @@ -1130,7 +1156,7 @@ void homa_server::server(int thread_id, server_metrics *metrics) */ class tcp_server { public: - tcp_server(int port, int id, int num_threads); + tcp_server(int port, int id, int num_threads, std::string& experiment); ~tcp_server(); void accept(int epoll_fd); void read(int fd, int pid); @@ -1148,6 +1174,9 @@ class tcp_server { /** @id: Unique identifier for this server. */ int id; + /** @experiment: name of the experiment this server is running. */ + string experiment; + /** @listen_fd: File descriptor for the listen socket. */ int listen_fd; @@ -1190,9 +1219,11 @@ std::vector tcp_servers; * requests. * @id: Unique identifier for this server. * @num_threads: Number of threads to service this listening socket and - * all of the other sockets excepted from it. + * all of the other sockets accepted from it. + * @experiment: Name of the experiment in which this server is participating. */ -tcp_server::tcp_server(int port, int id, int num_threads) +tcp_server::tcp_server(int port, int id, int num_threads, + std::string& experiment) : mutex(0) , port(port) , id(id) @@ -1204,12 +1235,16 @@ tcp_server::tcp_server(int port, int id, int num_threads) , threads() , stop(false) { + if (std::find(experiments.begin(), experiments.end(), experiment) + == experiments.end()) + experiments.emplace_back(experiment); + memset(connections, 0, sizeof(connections)); listen_fd = socket(inet_family, SOCK_STREAM, 0); if (listen_fd == -1) { log(NORMAL, "FATAL: couldn't open server socket: %s\n", strerror(errno)); - exit(1); + fatal(); } int option_value = 1; if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &option_value, @@ -1217,13 +1252,13 @@ tcp_server::tcp_server(int port, int id, int num_threads) log(NORMAL, "FATAL: couldn't set SO_REUSEADDR on listen " "socket: %s", strerror(errno)); - exit(1); + fatal(); } if (fcntl(listen_fd, F_SETFL, O_NONBLOCK) != 0) { log(NORMAL, "FATAL: couldn't set O_NONBLOCK on listen " "socket: %s", strerror(errno)); - exit(1); + fatal(); } sockaddr_in_union addr; if (inet_family == AF_INET) { @@ -1238,12 +1273,12 @@ tcp_server::tcp_server(int port, int id, int num_threads) if (bind(listen_fd, &addr.sa, sizeof(addr)) == -1) { log(NORMAL, "FATAL: couldn't bind to port %d: %s\n", port, strerror(errno)); - exit(1); + fatal(); } if (listen(listen_fd, 1000) == -1) { log(NORMAL, "FATAL: couldn't listen on socket: %s", strerror(errno)); - exit(1); + fatal(); } epoll_fd = epoll_create(10); @@ -1251,7 +1286,7 @@ tcp_server::tcp_server(int port, int id, int num_threads) log(NORMAL, "FATAL: couldn't create epoll instance for " "TCP server: %s\n", strerror(errno)); - exit(1); + fatal(); } struct epoll_event ev; ev.events = EPOLLIN; @@ -1259,10 +1294,10 @@ tcp_server::tcp_server(int port, int id, int num_threads) if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) { log(NORMAL, "FATAL: couldn't add listen socket to epoll: %s\n", strerror(errno)); - exit(1); + fatal(); } - 
metrics = new server_metrics;
+       metrics = new server_metrics(experiment);
        ::metrics.push_back(metrics);
 
        for (int i = 0; i < num_threads; i++)
@@ -1286,7 +1321,7 @@ tcp_server::~tcp_server()
        if (pipe2(fds, 0) < 0) {
                log(NORMAL, "FATAL: couldn't create pipe to shutdown TCP "
                                "server: %s\n", strerror(errno));
-               exit(1);
+               fatal();
        }
        struct epoll_event ev;
        ev.events = EPOLLIN;
@@ -1295,7 +1330,7 @@
        if (write(fds[1], "xxxx", 4) < 0) {
                log(NORMAL, "FATAL: couldn't write to TCP shutdown pipe: %s\n",
                                strerror(errno));
-               exit(1);
+               fatal();
        }
 
        for (size_t i = 0; i < threads.size(); i++)
@@ -1357,7 +1392,7 @@ void tcp_server::server(int thread_id)
                                continue;
                        log(NORMAL, "FATAL: epoll_wait failed: %s\n",
                                        strerror(errno));
-                       exit(1);
+                       fatal();
                }
                tt("epoll_wait returned %d events in server pid %d",
                                num_events, pid);
@@ -1381,6 +1416,7 @@
                        }
                }
        }
+       log(NORMAL, "TCP server thread %s exiting\n", thread_name);
 }
 
 /**
@@ -1400,7 +1436,34 @@ void tcp_server::accept(int epoll_fd)
                        return;
                log(NORMAL, "FATAL: couldn't accept incoming TCP connection: "
                                "%s\n", strerror(errno));
-               exit(1);
+               fatal();
+       }
+
+       /* Make sure the connection appears to be coming from someone
+       * we trust (as of August 2023, at CloudLab, external sites
+       * could open connections). IPv4 connections are accepted only
+       * from the RFC 1918 private ranges (10/8, 172.16/12, 192.168/16).
+       */
+       if (client_addr.in4.sin_family == AF_INET) {
+               uint8_t *ipaddr = (uint8_t *) &client_addr.in4.sin_addr;
+               bool is_internet = true;
+
+               if (ipaddr[0] == 10) {
+                       is_internet = false;
+               }
+               else if (ipaddr[0] == 172 && (ipaddr[1] >= 16 && ipaddr[1] <= 31)) {
+                       is_internet = false;
+               }
+               else if (ipaddr[0] == 192 && ipaddr[1] == 168) {
+                       is_internet = false;
+               }
+
+               if (is_internet) {
+                       log(NORMAL, "ERROR: tcp_server::accept rejecting "
                                       "rogue TCP connection from %s\n",
+                                       print_address(&client_addr));
+                       ::close(fd);
+                       return;
+               }
        }
        log(NORMAL, "tcp_server on port %d accepted connection from %s, fd %d\n",
                        port, print_address(&client_addr), fd);
@@ -1409,7 +1472,7 @@
        if (fd >= MAX_FDS) {
                log(NORMAL, "FATAL: TCP socket fd %d is greater than MAX_FDS\n",
                                fd);
-               exit(1);
+               fatal();
        }
        spin_lock lock_guard(&fd_locks[fd]);
        tcp_connection *connection = new tcp_connection(fd, fd, port,
@@ -1444,6 +1507,7 @@
        }
        if ((header->short_response) && (header->length > 100))
                header->length = 100;
+       header->response = 1;
        metrics->bytes_out += header->length;
        if (!connections[fd]->send_message(header))
                connections[fd]->set_epoll_events(epoll_fd,
@@ -1486,10 +1550,15 @@ class client {
        */
        bool active;
 
+       /**
+       * @id: RPC identifier for the request (only for Homa requests).
+       */
+       uint64_t id;
+
        rinfo() : start_time(0), request_length(0), active(false) {}
        };
 
-       client(int id);
+       client(int id, std::string& experiment);
        virtual ~client();
        void check_completion(const char *protocol);
        int get_rinfo();
@@ -1502,8 +1571,33 @@ class client {
        */
        int id;
 
-       /** @num_servers: Number of servers this client will send requests to. */
-       size_t num_servers;
+       /** @experiment: name of the experiment this client is running. */
+       string experiment;
+
+       /**
+       * @server_addrs: Internet addresses for each of the server ports
+       * where this client will send RPCs.
+       */
+       std::vector<sockaddr_in_union> server_addrs;
+
+       /**
+       * @server_conns: for each entry in @server_addrs, a connection
+       * identifier with all fields filled in except client_port, which
+       * will be 0.
+ */ + std::vector<conn_id> server_conns; + + /** + * @freeze: one entry for each node index; 1 means messages to that + * node should contain a flag telling the node to freeze its time trace. + */ + std::vector<int> freeze; + + /** + * @first_id: entry i contains the index in server_addrs of the first + * entry for the server ports on node i. + */ + std::vector<int> first_id; /** * @rinfos: storage for more than enough rinfos to handle all of the @@ -1606,17 +1700,23 @@ class client { std::vector<client *> clients; /** - * client::client() - Constructor for client objects. + * client::client() - Constructor for client objects. Uses configuration + * information from global variables to initialize. * - * @id: Unique identifier for this client (index starting at 0?) + * @id: Unique identifier for this client (index starting at 0?) + * @experiment: Name of experiment in which this client will participate. */ -client::client(int id) +client::client(int id, std::string& experiment) : id(id) - , num_servers(server_addrs.size()) + , experiment(experiment) + , server_addrs() + , server_conns() + , freeze() + , first_id() , last_rinfo(0) , receivers_running(0) , cycles_per_second(get_cycles_per_sec()) - , server_dist(0, static_cast<int>(num_servers - 1)) + , server_dist() , length_dist(workload, HOMA_MAX_MESSAGE_LENGTH) , actual_lengths(NUM_CLIENT_STATS, 0) , actual_rtts(NUM_CLIENT_STATS, 0) @@ -1627,13 +1727,59 @@ , total_rtt(0) , lag(0) { + if (std::find(experiments.begin(), experiments.end(), experiment) + == experiments.end()) + experiments.emplace_back(experiment); + + server_addrs.clear(); + server_conns.clear(); + freeze.clear(); + first_id.clear(); + for (int node: server_ids) { + char host[100]; + struct addrinfo hints; + struct addrinfo *matching_addresses; + sockaddr_in_union *dest; + + if (node == node_id) + continue; + snprintf(host, sizeof(host), "node%d", node); + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = inet_family; + hints.ai_socktype = SOCK_DGRAM; + int status = getaddrinfo(host, NULL, &hints, + &matching_addresses); + if (status != 0) { + log(NORMAL, "FATAL: couldn't look up address " + "for %s: %s\n", + host, gai_strerror(status)); + fatal(); + } + dest = reinterpret_cast<sockaddr_in_union *> + (matching_addresses->ai_addr); + while (((int) first_id.size()) < node) + first_id.push_back(-1); + first_id.push_back((int) server_addrs.size()); + for (int thread = 0; thread < server_ports; thread++) { + dest->in4.sin_port = htons(first_port + thread); + server_addrs.push_back(*dest); + server_conns.emplace_back(node, thread, node_id, 0); + } + while (((int) freeze.size()) <= node) + freeze.push_back(0); + freeaddrinfo(matching_addresses); + } + + server_dist.param(std::uniform_int_distribution<>::param_type(0, + static_cast<int>(server_addrs.size() - 1))); + rinfos.resize(2*client_port_max + 5); double avg_length = length_dist.get_mean(); double rate = 1e09*(net_gbps/8.0)/(avg_length*client_ports); interval_dist = std::exponential_distribution<double>(rate); requests.resize(server_addrs.size()); - responses = new std::atomic<uint64_t>[num_servers]; - for (size_t i = 0; i < num_servers; i++) + responses = new std::atomic<uint64_t>[server_addrs.size()]; + for (size_t i = 0; i < server_addrs.size(); i++) responses[i] = 0; log(NORMAL, "Average message length %.1f KB, rate %.2f K/sec, " "expected BW %.1f Gbps\n", @@ -1651,7 +1797,7 @@ client::~client() /** * check_completion() - Make sure that all outstanding requests have - * completed; if not, generate a log message. + * completed; if not, generate log messages. 
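The parallel structures introduced above encode an indexing invariant that is easy to miss in the constructor: server_addrs and server_conns are packed densely, one entry per (node, thread) pair, while first_id and freeze are indexed sparsely by node number. A worked example with hypothetical values (assuming server_ids = {1, 3} and server_ports = 2):

```cpp
/* server_addrs = { node1:port0, node1:port1, node3:port0, node3:port1 }
 * first_id     = { -1, 0, -1, 2 }  // -1 at entry n: no servers on node n
 * freeze       = {  0, 0,  0, 0 }  // indexed by node number
 *
 * so the server_addrs slot for a given (node, thread) pair is: */
static int server_index(const std::vector<int> &first_id, int node, int thread)
{
	return first_id[node] + thread;  /* only valid if first_id[node] >= 0 */
}
```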
* @protocol: String that identifies the current protocol for the log * message, if any. */ @@ -1666,7 +1812,9 @@ void client::check_completion(const char *protocol) continue; if (!server_info.empty()) server_info.append(", "); - snprintf(buffer, sizeof(buffer), "s%lu: %d", i, diff); + snprintf(buffer, sizeof(buffer), "node%d.%d: %d", + server_conns[i].server, + server_conns[i].server_port, diff); server_info.append(buffer); } if ((incomplete != 0) || !server_info.empty()) @@ -1697,7 +1845,7 @@ int client::get_rinfo() "total_responses %ld, last_rinfo %d)\n", rinfos.size(), total_requests, total_responses.load(), last_rinfo); - exit(1); + fatal(); } } } @@ -1720,8 +1868,11 @@ void client::record(uint64_t end_time, message_header *header) } rinfo *r = &rinfos[header->msg_id]; if (!r->active) { - log(NORMAL, "ERROR: response arrived for inactive msg_id %u\n", - header->msg_id); + int *int_hdr = reinterpret_cast<int *>(header); + log(NORMAL, "ERROR: response arrived for inactive msg_id %u, " + "header 0x%x, 0x%x, 0x%x\n", + header->msg_id, int_hdr[0], int_hdr[1], + int_hdr[2]); return; } rtt = end_time - r->start_time; @@ -1768,7 +1919,7 @@ */ class homa_client : public client { public: - homa_client(int id); + homa_client(int id, std::string& experiment); virtual ~homa_client(); void measure_unloaded(int count); uint64_t measure_rtt(int server, int length, char *buffer, @@ -1776,6 +1927,7 @@ void receiver(int id); void sender(void); virtual void stop_sender(void); + void timeout(homa::receiver *receiver); bool wait_response(homa::receiver *receiver, uint64_t rpc_id); /** @fd: file descriptor for Homa socket. */ int fd; @@ -1818,13 +1970,14 @@ /** * homa_client::homa_client() - Constructor for homa_client objects. * - * @id: Unique identifier for this client (index starting at 0?) + * @id: Unique identifier for this client (index starting at 0?). + * @experiment: Name of experiment in which this client will participate. 
*/ -homa_client::homa_client(int id) - : client(id) +homa_client::homa_client(int id, std::string& experiment) + : client(id, experiment) , fd(-1) , buf_region(nullptr) - , buf_size(2000*HOMA_BPAGE_SIZE) + , buf_size(buf_bpages*HOMA_BPAGE_SIZE) , exit_sender(false) , exit_receivers(false) , sender_exited(false) @@ -1832,12 +1985,12 @@ , receiving_threads() , sending_thread() { - struct homa_set_buf_args arg; + struct homa_rcvbuf_args arg; fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); if (fd < 0) { log(NORMAL, "Couldn't open Homa socket: %s\n", strerror(errno)); - exit(1); + fatal(); } buf_region = (char *) mmap(NULL, buf_size, PROT_READ|PROT_WRITE, @@ -1845,16 +1998,16 @@ if (buf_region == MAP_FAILED) { printf("Couldn't mmap buffer region for homa_client id %d: %s\n", id, strerror(errno)); - exit(1); + fatal(); } - arg.start = buf_region; + arg.start = (uintptr_t)buf_region; arg.length = buf_size; - int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, + int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); if (status < 0) { - printf("FATAL: error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("FATAL: error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); - exit(1); + fatal(); } if (unloaded) { @@ -1934,21 +2087,27 @@ bool homa_client::wait_response(homa::receiver *receiver, uint64_t rpc_id) rpc_id = 0; ssize_t length; do { - length = receiver->receive(HOMA_RECVMSG_RESPONSE, rpc_id); + length = receiver->receive(0, rpc_id); } while ((length < 0) && ((errno == EAGAIN) || (errno == EINTR))); if (length < 0) { if (exit_receivers) return false; - log(NORMAL, "FATAL: error in recvmsg: %s (id %lu, server %s)\n", - strerror(errno), rpc_id, - print_address(receiver->src_addr())); - exit(1); + if (errno == ETIMEDOUT) { + timeout(receiver); + return true; + } + log(NORMAL, "FATAL: error in Homa recvmsg: %s (id %lu, " + "server %s)\n", + strerror(errno), receiver->id(), + print_address((union sockaddr_in_union *) + receiver->src_addr())); + fatal(); } header = receiver->get<message_header>(0); if (header == nullptr) { - log(NORMAL, "FATAL: response message contained %lu bytes; " + log(NORMAL, "FATAL: Homa response message contained %lu bytes; " "need at least %lu", length, sizeof(*header)); - exit(1); + fatal(); } uint64_t end_time = rdtsc(); tt("Received response, cid 0x%08x, id %x, %d bytes", @@ -1957,6 +2116,29 @@ return true; } + +/** + * timeout() - Invoked to process Homa timeouts (free up the rinfo struct). + * @receiver: Holds information about the failed RPC. + */ +void homa_client::timeout(homa::receiver *receiver) +{ + uint64_t id = receiver->id(); + for (struct rinfo &r: rinfos) { + if (r.id == id) { + log(NORMAL, "ERROR: Homa RPC timed out, id %lu, " + "length %d, server %s\n", + id, r.request_length, + print_address((union sockaddr_in_union *) + receiver->src_addr())); + r.active = false; + return; + } + } + log(NORMAL, "FATAL: couldn't find rinfo for timed out RPC id %lu\n", id); + fatal(); +} + /** * homa_client::sender() - Invoked as the top-level method in a thread; * invokes a pseudo-random stream of RPCs continuously. 
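For readers who only want the renamed buffer-registration API in one place (SO_HOMA_SET_BUF becomes SO_HOMA_RCVBUF, homa_set_buf_args becomes homa_rcvbuf_args, and start is now an integer rather than a pointer), the sequence below is condensed from the constructor above. Error handling is omitted, and the MAP_PRIVATE|MAP_ANONYMOUS flags are an assumption, since the hunk elides the mmap() arguments:

```cpp
#include <stdint.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include "homa.h"   /* or <linux/homa.h> once upstreamed */

/* Condensed sketch of homa_client::homa_client(): open a Homa socket
 * and register a receive-buffer region of buf_bpages bpages. */
static int open_homa_socket(int inet_family, int buf_bpages, char **region)
{
	size_t buf_size = (size_t) buf_bpages * HOMA_BPAGE_SIZE;
	struct homa_rcvbuf_args arg;
	int fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA);

	*region = (char *) mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	arg.start = (uintptr_t) *region;
	arg.length = buf_size;
	setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg));
	return fd;
}
```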
@@ -1967,13 +2149,16 @@ void homa_client::sender() uint64_t next_start = rdtsc(); char thread_name[50]; homa::receiver receiver(fd, buf_region); + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; + struct iovec vec[2]; + int num_vecs; snprintf(thread_name, sizeof(thread_name), "C%d", id); time_trace::thread_buffer thread_buffer(thread_name); while (1) { uint64_t now; - uint64_t rpc_id; int server; int status; int slot = get_rinfo(); @@ -2002,30 +2187,37 @@ void homa_client::sender() if (header->length < sizeof32(*header)) header->length = sizeof32(*header); rinfos[slot].request_length = header->length; - header->cid = server_ids[server]; + header->cid = server_conns[server]; header->cid.client_port = id; + header->msg_id = slot; header->freeze = freeze[header->cid.server]; header->short_response = one_way; - header->msg_id = slot; + header->response = 0; tt("sending request, cid 0x%08x, id %u, length %d", header->cid, header->msg_id, header->length); + if (client_iovec && (header->length > 20)) { - struct iovec vec[2]; vec[0].iov_base = sender_buffer; vec[0].iov_len = 20; vec[1].iov_base = sender_buffer + 20; vec[1].iov_len = header->length - 20; - status = homa_sendv(fd, vec, 2, - &server_addrs[server], &rpc_id, 0); - } else - status = homa_send(fd, sender_buffer, header->length, - &server_addrs[server], &rpc_id, 0); + num_vecs = 2; + } else { + vec[0].iov_base = sender_buffer; + vec[0].iov_len = header->length; + num_vecs = 1; + } + init_sendmsg_hdrs(&msghdr, &homa_args, vec, num_vecs, + &server_addrs[server].sa, + sockaddr_size(&server_addrs[server].sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - log(NORMAL, "FATAL: error in homa_send: %s (request " + log(NORMAL, "FATAL: error in Homa sendmsg: %s (request " "length %d)\n", strerror(errno), header->length); - exit(1); + fatal(); } + rinfos[slot].id = homa_args.id; requests[server]++; total_requests++; lag = now - next_start; @@ -2033,7 +2225,7 @@ void homa_client::sender() if (receivers_running == 0) { /* There isn't a separate receiver thread; wait for * the response here. 
*/ - wait_response(&receiver, rpc_id); + wait_response(&receiver, homa_args.id); } } } @@ -2046,7 +2238,7 @@ void homa_client::receiver(int receiver_id) { char thread_name[50]; - snprintf(thread_name, sizeof(thread_name), "R%d.%d", id, receiver_id); + snprintf(thread_name, sizeof(thread_name), "R%d.%d", node_id, receiver_id); time_trace::thread_buffer thread_buffer(thread_name); homa::receiver receiver(fd, buf_region); @@ -2069,8 +2261,10 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, homa::receiver *receiver) { message_header *header = reinterpret_cast<message_header *>(buffer); + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; + struct iovec vec; uint64_t start; - uint64_t rpc_id; int status; header->length = length; @@ -2078,26 +2272,32 @@ header->length = HOMA_MAX_MESSAGE_LENGTH; if (header->length < sizeof32(*header)) header->length = sizeof32(*header); - header->cid = server_ids[server]; + header->cid = server_conns[server]; header->cid.client_port = id; start = rdtsc(); + + vec.iov_base = buffer; + vec.iov_len = header->length; + init_sendmsg_hdrs(&msghdr, &homa_args, &vec, 1, + &server_addrs[server].sa, + sockaddr_size(&server_addrs[server].sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - log(NORMAL, "FATAL: error in homa_send: %s (request " + log(NORMAL, "FATAL: error in Homa sendmsg: %s (request " "length %d)\n", strerror(errno), header->length); - exit(1); + fatal(); } do { - status = receiver->receive(0, rpc_id); + status = receiver->receive(0, homa_args.id); } while ((status < 0) && ((errno == EAGAIN) || (errno == EINTR))); if (status < 0) { log(NORMAL, "FATAL: measure_rtt got error in recvmsg: %s " - "(id %lu, server %s)\n", - strerror(errno), rpc_id, - print_address(receiver->src_addr())); - exit(1); + "(id %llu, server %s)\n", + strerror(errno), homa_args.id, + print_address((union sockaddr_in_union *) + receiver->src_addr())); + fatal(); } return rdtsc() - start; } @@ -2156,7 +2356,7 @@ */ class tcp_client : public client { public: - tcp_client(int id); + tcp_client(int id, std::string& experiment); virtual ~tcp_client(); void read(tcp_connection *connection, int pid); void receiver(int id); @@ -2220,10 +2420,11 @@ class tcp_client { /** * tcp_client::tcp_client() - Constructor for tcp_client objects. * - * @id: Unique identifier for this client (index starting at 0?) + * @id: Unique identifier for this client (index starting at 0?) + * @experiment: Name of experiment in which this client will participate. 
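wait_response() and measure_rtt() above now share one receive idiom, worth spelling out once (homa::receiver comes from homa_receiver.h; the semantics of the id argument are inferred from these call sites): id == 0 accepts any incoming message, while a nonzero id, such as the value sendmsg() stored in homa_args.id, waits for that specific RPC's response. A distilled sketch:

```cpp
#include <cerrno>

/* Sketch of the shared receive loop: retry transient failures, return
 * the message length or a negative value with errno set (for example
 * ETIMEDOUT, which wait_response() hands off to timeout()). */
static ssize_t wait_for_message(homa::receiver *recv, uint64_t id)
{
	ssize_t length;

	do {
		length = recv->receive(0, id);
	} while ((length < 0) && ((errno == EAGAIN) || (errno == EINTR)));
	return length;
}
```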
*/ -tcp_client::tcp_client(int id) - : client(id) +tcp_client::tcp_client(int id, std::string& experiment) + : client(id, experiment) , connections() , blocked() , bytes_sent() @@ -2235,8 +2436,8 @@ , receiving_threads() , sending_thread() { - bytes_rcvd = new std::atomic<uint64_t>[num_servers]; - for (size_t i = 0; i < num_servers; i++) { + bytes_rcvd = new std::atomic<uint64_t>[server_addrs.size()]; + for (size_t i = 0; i < server_addrs.size(); i++) { bytes_sent.push_back(0); bytes_rcvd[i] = 0; } @@ -2244,7 +2445,7 @@ if (epoll_fd < 0) { log(NORMAL, "FATAL: tcp_client couldn't create epoll " "instance: %s\n", strerror(errno)); - exit(1); + fatal(); } for (uint32_t i = 0; i < server_addrs.size(); i++) { @@ -2253,7 +2454,7 @@ log(NORMAL, "FATAL: couldn't open TCP client " "socket: %s\n", strerror(errno)); - exit(1); + fatal(); } if (connect(fd, reinterpret_cast<struct sockaddr *>( &server_addrs[i]), @@ -2262,7 +2463,7 @@ "to %s: %s\n", print_address(&server_addrs[i]), strerror(errno)); - exit(1); + fatal(); } int flag = 1; setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(flag)); @@ -2271,7 +2472,7 @@ "to server %s: %s", print_address(&server_addrs[i]), strerror(errno)); - exit(1); + fatal(); } sockaddr_in_union addr; socklen_t length = sizeof(addr); @@ -2279,7 +2480,7 @@ &length)) { log(NORMAL, "FATAL: getsockname failed for TCP client: " "%s\n", strerror(errno)); - exit(1); + fatal(); } connections.emplace_back(new tcp_connection(fd, i, ntohs(addr.in4.sin_port), server_addrs[i])); @@ -2315,7 +2516,7 @@ tcp_client::~tcp_client() if (pipe2(fds, 0) < 0) { log(NORMAL, "FATAL: couldn't create pipe to shutdown TCP " "server: %s\n", strerror(errno)); - exit(1); + fatal(); } struct epoll_event ev; ev.events = EPOLLIN; @@ -2324,7 +2525,7 @@ if (write(fds[1], "xxxx", 4) < 0) { log(NORMAL, "FATAL: couldn't write to TCP shutdown " "pipe: %s\n", strerror(errno)); - exit(1); + fatal(); } if (sending_thread) @@ -2397,11 +2598,12 @@ void tcp_client::sender() if ((header.length > HOMA_MAX_MESSAGE_LENGTH) && tcp_trunc) header.length = HOMA_MAX_MESSAGE_LENGTH; rinfos[slot].request_length = header.length; - header.cid = server_ids[server]; + header.cid = server_conns[server]; header.cid.client_port = id; header.msg_id = slot; header.freeze = freeze[header.cid.server]; header.short_response = one_way; + header.response = 0; size_t old_pending = connections[server]->pending(); tt("Sending TCP request, cid 0x%08x, id %u, length %d, pid %d", header.cid, header.msg_id, header.length, @@ -2468,7 +2670,7 @@ void tcp_client::receiver(int receiver_id) log(NORMAL, "FATAL: epoll_wait failed in tcp_client: " "%s\n", strerror(errno)); - exit(1); + fatal(); } tt("epoll_wait returned %d events in client pid %d", num_events, pid); @@ -2502,8 +2704,85 @@ void tcp_client::read(tcp_connection *connection, int pid) if (error) { log(NORMAL, "FATAL: %s (client)\n", connection->error_message); - exit(1); + fatal(); + } +} + +/** + * log_homa_info() - Use the HOMAIOCINFO ioctl to extract the status of a + * Homa socket and print the information to the log. + * @fd: File descriptor for a Homa socket. 
+ */ +void log_homa_info(int fd) { +#define MAX_RPCS 1000 + struct homa_rpc_info *rpcs, *rinfo; + struct homa_info hinfo; + const char *priv; + std::string flags; + bool is_server; + int status; + + rpcs = new homa_rpc_info[MAX_RPCS]; + hinfo.rpc_info = rpcs; + hinfo.rpc_info_length = MAX_RPCS * sizeof(*rpcs); + status = ioctl(fd, HOMAIOCINFO, &hinfo); + if (status != 0) { + log(NORMAL, "HOMAIOCINFO failed for fd %d (%p): %s\n", fd, + &hinfo, strerror(errno)); + goto done; + } + log(NORMAL, " Homa info for port %d (fd %d):\n", hinfo.port, fd); + log(NORMAL, " Free bytes in rx buffer pool: %llu\n", + hinfo.bpool_avail_bytes); + log(NORMAL, " %d active RPCs\n", hinfo.num_rpcs); + for (__u32 i = 0; i < hinfo.num_rpcs; i++) { + rinfo = &rpcs[i]; + is_server = rinfo->id & 1; + if (rinfo->flags & HOMA_RPC_PRIVATE) + priv = "(private)"; + else + priv = ""; + log(NORMAL, " %s RPC id %llu%s:\n", is_server ? "Server" : "Client", + rinfo->id, priv); + log(NORMAL, " Peer: %s\n", print_address(reinterpret_cast< + union sockaddr_in_union *>(&rinfo->peer))); + if (!is_server) + log(NORMAL, " Completion cookie: %lld\n", + rinfo->completion_cookie); + if (rinfo->tx_length >= 0) + log(NORMAL, " Tx: %d/%d sent, %d granted, prio %u\n", + rinfo->tx_sent, rinfo->tx_length, + rinfo->tx_granted, rinfo->tx_prio); + else + log(NORMAL, " Tx: not yet initiated\n"); + if (rinfo->rx_length >= 0) { + char gap_info[100]; + const char *state; + + if (rinfo->rx_gaps != 0) + snprintf(gap_info, sizeof(gap_info), + ", %d gaps (%d missing bytes)", + rinfo->rx_gaps, + rinfo->rx_gap_bytes); + else + gap_info[0] = 0; + state = ""; + if (rinfo->flags & HOMA_RPC_BUF_STALL) + state = " (waiting for buffer space)"; + else if (rinfo->flags & HOMA_RPC_RX_COPY) + state = " (data available for copying to user space)"; + else if (rinfo->flags & (HOMA_RPC_RX_READY)) + state = " (queued waiting for recvmsg)"; + log(NORMAL, " Rx: %d/%d remaining, %d granted%s%s\n", + rinfo->rx_remaining, rinfo->rx_length, + rinfo->rx_granted, gap_info, state); + } else { + log(NORMAL, " Rx: no packets received yet\n"); + } } + +done: + delete[] rpcs; } /** @@ -2514,46 +2793,59 @@ void tcp_client::read(tcp_connection *connection, int pid) */ void server_stats(uint64_t now) { - char details[10000]; - int offset = 0; - int length; - uint64_t server_rpcs = 0; - uint64_t server_bytes_in = 0; - uint64_t server_bytes_out = 0; - details[0] = 0; - for (uint32_t i = 0; i < metrics.size(); i++) { - server_metrics *server = metrics[i]; - server_rpcs += server->requests; - server_bytes_in += server->bytes_in; - server_bytes_out += server->bytes_out; - length = snprintf(details + offset, sizeof(details) - offset, - "%s%lu", (offset != 0) ? " " : "", - server->requests - last_per_server_rpcs[i]); - offset += length; - if (i > last_per_server_rpcs.size()) - printf("last_per_server_rpcs has %lu entries, needs %lu\n", - last_per_server_rpcs.size(), - metrics.size()); - last_per_server_rpcs[i] = server->requests; - } - if ((last_stats_time != 0) && (server_bytes_in != last_server_bytes_in)) { - double elapsed = to_seconds(now - last_stats_time); - double rpcs = (double) (server_rpcs - last_server_rpcs); - double in_delta = (double) (server_bytes_in - - last_server_bytes_in); - double out_delta = (double) (server_bytes_out - - last_server_bytes_out); - log(NORMAL, "Servers: %.2f Kops/sec, %.2f Gbps in, " - "%.2f Gbps out, avg. req. 
length %.1f bytes\n", - rpcs/(1000.0*elapsed), - 8.0*in_delta/(1e09*elapsed), - 8.0*out_delta/(1e09*elapsed), - in_delta/rpcs); - log(NORMAL, "RPCs per server: %s\n", details); - } - last_server_rpcs = server_rpcs; - last_server_bytes_in = server_bytes_in; - last_server_bytes_out = server_bytes_out; + last_per_server_rpcs.resize(metrics.size(), 0); + last_server_rpcs.resize(experiments.size(), 0); + last_server_bytes_in.resize(experiments.size(), 0); + last_server_bytes_out.resize(experiments.size(), 0); + + for (size_t i = 0; i < experiments.size(); i++) { + std::string& exp = experiments[i]; + char details[10000]; + int offset = 0; + int length; + uint64_t server_rpcs = 0; + uint64_t server_bytes_in = 0; + uint64_t server_bytes_out = 0; + + details[0] = 0; + for (uint32_t j = 0; j < metrics.size(); j++) { + server_metrics *smetrics = metrics[j]; + if (smetrics->experiment != exp) + continue; + server_rpcs += smetrics->requests; + server_bytes_in += smetrics->bytes_in; + server_bytes_out += smetrics->bytes_out; + length = snprintf(details + offset, + sizeof(details) - offset, + "%s%lu", (offset != 0) ? " " : "", + smetrics->requests - last_per_server_rpcs[j]); + offset += length; + last_per_server_rpcs[j] = smetrics->requests; + } + if ((last_stats_time != 0) && (server_bytes_in + != last_server_bytes_in[i])) { + double elapsed = to_seconds(now - last_stats_time); + double rpcs = (double) (server_rpcs + - last_server_rpcs[i]); + double in_delta = (double) (server_bytes_in + - last_server_bytes_in[i]); + double out_delta = (double) (server_bytes_out + - last_server_bytes_out[i]); + log(NORMAL, "%s servers: %.2f Kops/sec, %.2f Gbps in, " + "%.2f Gbps out, avg. req. length " + "%.1f bytes\n", + exp.c_str(), + rpcs/(1000.0*elapsed), + 8.0*in_delta/(1e09*elapsed), + 8.0*out_delta/(1e09*elapsed), + in_delta/rpcs); + log(NORMAL, "RPCs per %s server thread: %s\n", + exp.c_str(), details); + } + last_server_rpcs[i] = server_rpcs; + last_server_bytes_in[i] = server_bytes_in; + last_server_bytes_out[i] = server_bytes_out; + } } /** @@ -2565,94 +2857,130 @@ void client_stats(uint64_t now) { #define CDF_VALUES 100000 - uint64_t client_rpcs = 0; - uint64_t request_bytes = 0; - uint64_t response_bytes = 0; - uint64_t total_rtt = 0; - uint64_t lag = 0; - uint64_t outstanding_rpcs = 0; - uint64_t cdf_times[CDF_VALUES]; - uint64_t backups = 0; - int times_per_client; - int cdf_index = 0; - - if (clients.size() == 0) - return; + std::vector<int> num_clients(experiments.size(), 0); + size_t i; - times_per_client = CDF_VALUES/clients.size(); - if (times_per_client > NUM_CLIENT_STATS) - times_per_client = NUM_CLIENT_STATS; for (client *client: clients) { - for (size_t i = 0; i < client->num_servers; i++) - client_rpcs += client->responses[i]; - request_bytes += client->request_bytes; - response_bytes += client->response_bytes; - total_rtt += client->total_rtt; - lag += client->lag; - outstanding_rpcs += client->total_requests - - client->total_responses; - for (int i = 1; i <= times_per_client; i++) { - /* Collect the most recent RTTs from the client for - * computing a CDF. - */ - int src = (client->total_responses - i) - % NUM_CLIENT_STATS; - if (client->actual_rtts[src] == 0) { - /* Client hasn't accumulated times_per_client - * entries yet; just use what it has. 
*/ + for (i = 0; i < experiments.size(); i++) { + if (experiments[i] == client->experiment) break; + } + if (i == experiments.size()) { + experiments.emplace_back(client->experiment); + num_clients.push_back(0); + } + num_clients[i]++; + } + + last_client_rpcs.resize(experiments.size(), 0); + last_client_bytes_out.resize(experiments.size(), 0); + last_client_bytes_in.resize(experiments.size(), 0); + last_total_rtt.resize(experiments.size(), 0); + last_lag.resize(experiments.size(), 0); + last_backups.resize(experiments.size(), 0); + + for (i = 0; i < experiments.size(); i++) { + std::string& exp = experiments[i]; + uint64_t client_rpcs = 0; + uint64_t request_bytes = 0; + uint64_t response_bytes = 0; + uint64_t total_rtt = 0; + uint64_t lag = 0; + uint64_t outstanding_rpcs = 0; + uint64_t cdf_times[CDF_VALUES]; + uint64_t backups = 0; + int times_per_client; + int cdf_index = 0; + + if (num_clients[i] == 0) + continue; + + times_per_client = CDF_VALUES/num_clients[i]; + if (times_per_client > NUM_CLIENT_STATS) + times_per_client = NUM_CLIENT_STATS; + for (client *client: clients) { + if (client->experiment != exp) + continue; + for (size_t j = 0; j < client->server_addrs.size(); j++) + client_rpcs += client->responses[j]; + request_bytes += client->request_bytes; + response_bytes += client->response_bytes; + total_rtt += client->total_rtt; + lag += client->lag; + outstanding_rpcs += client->total_requests + - client->total_responses; + for (int j = 1; j <= times_per_client; j++) { + /* Collect the most recent RTTs from the client + * for computing a CDF. + */ + int src = (client->total_responses - j) + % NUM_CLIENT_STATS; + if (client->actual_rtts[src] == 0) { + /* Client hasn't accumulated + * times_per_client entries yet; just + * use what it has. + */ + break; + } + cdf_times[cdf_index] = client->actual_rtts[src]; + cdf_index++; } - cdf_times[cdf_index] = client->actual_rtts[src]; - cdf_index++; + tcp_client *tclient = dynamic_cast<tcp_client *>(client); + if (tclient) + backups += tclient->backups; } - tcp_client *tclient = dynamic_cast<tcp_client *>(client); - if (tclient) - backups += tclient->backups; - } - std::sort(cdf_times, cdf_times + cdf_index); - if ((last_stats_time != 0) && (request_bytes != last_client_bytes_out)) { - double elapsed = to_seconds(now - last_stats_time); - double rpcs = (double) (client_rpcs - last_client_rpcs); - double delta_out = (double) (request_bytes - - last_client_bytes_out); - double delta_in = (double) (response_bytes - - last_client_bytes_in); - log(NORMAL, "Clients: %.2f Kops/sec, %.2f Gbps out, " - "%.2f Gbps in, RTT (us) P50 %.2f P99 %.2f " - "P99.9 %.2f, avg. req. 
length %.1f bytes\n", - rpcs/(1000.0*elapsed), - 8.0*delta_out/(1e09*elapsed), - 8.0*delta_in/(1e09*elapsed), - to_seconds(cdf_times[cdf_index/2])*1e06, - to_seconds(cdf_times[99*cdf_index/100])*1e06, - to_seconds(cdf_times[999*cdf_index/1000])*1e06, - delta_out/rpcs); - double lag_fraction; - if (lag > last_lag) - lag_fraction = (to_seconds(lag - last_lag)/elapsed) - / clients.size(); - else - lag_fraction = -(to_seconds(last_lag - lag)/elapsed) - / clients.size(); - if (lag_fraction >= .01) - log(NORMAL, "Lag due to overload: %.1f%%\n", - lag_fraction*100.0); - if (backups != 0) { - log(NORMAL, "Backed-up sends: %lu/%lu (%.1f%%)\n", - backups - last_backups, - client_rpcs - last_client_rpcs, - 100.0*(backups - last_backups) - /(client_rpcs - last_client_rpcs)); + std::sort(cdf_times, cdf_times + cdf_index); + if ((last_stats_time != 0) && ((request_bytes + != last_client_bytes_out[i]) + || (outstanding_rpcs != 0))){ + double elapsed = to_seconds(now - last_stats_time); + double rpcs = (double) (client_rpcs - last_client_rpcs[i]); + double delta_out = (double) (request_bytes + - last_client_bytes_out[i]); + double delta_in = (double) (response_bytes + - last_client_bytes_in[i]); + log(NORMAL, "%s clients: %.2f Kops/sec, %.2f Gbps out, " + "%.2f Gbps in, RTT (us) P50 %.2f " + "P99 %.2f P99.9 %.2f, avg. req. length " + "%.1f bytes\n", + exp.c_str(), + rpcs/(1000.0*elapsed), + 8.0*delta_out/(1e09*elapsed), + 8.0*delta_in/(1e09*elapsed), + to_seconds(cdf_times[cdf_index/2])*1e06, + to_seconds(cdf_times[99*cdf_index/100])*1e06, + to_seconds(cdf_times[999*cdf_index/1000])*1e06, + delta_out/rpcs); + double lag_fraction; + if (lag > last_lag[i]) + lag_fraction = (to_seconds(lag + - last_lag[i])/elapsed) + / num_clients[i]; + else + lag_fraction = -(to_seconds(last_lag[i] + - lag)/elapsed) / num_clients[i]; + if (lag_fraction >= .01) + log(NORMAL, "Lag due to overload for %s " + "experiment: %.1f%%\n", + exp.c_str(), lag_fraction*100.0); + if (backups != 0) { + log(NORMAL, "Backed-up %s sends: %lu/%lu (%.1f%%)\n", + exp.c_str(), + backups - last_backups[i], + client_rpcs - last_client_rpcs[i], + 100.0*(backups - last_backups[i]) + /(client_rpcs - last_client_rpcs[i])); + } } + if (outstanding_rpcs != 0) + log(NORMAL, "Outstanding client RPCs for %s " + "experiment: %lu\n", + exp.c_str(), outstanding_rpcs); + last_client_rpcs[i] = client_rpcs; + last_client_bytes_out[i] = request_bytes; + last_client_bytes_in[i] = response_bytes; + last_total_rtt[i] = total_rtt; + last_lag[i] = lag; + last_backups[i] = backups; } - if (outstanding_rpcs != 0) - log(NORMAL, "Outstanding client RPCs: %lu\n", outstanding_rpcs); - last_client_rpcs = client_rpcs; - last_client_bytes_out = request_bytes; - last_client_bytes_in = response_bytes; - last_total_rtt = total_rtt; - last_lag = lag; - last_backups = backups; } /** @@ -2667,8 +2995,19 @@ void log_stats() uint64_t now = rdtsc(); server_stats(now); client_stats(now); - last_stats_time = now; + +#if 0 + for (client *client: clients) { + homa_client *hclient = + dynamic_cast<homa_client *>(client); + if (hclient == NULL) + continue; + log_homa_info(hclient->fd); + } + for (homa_server *server: homa_servers) + log_homa_info(server->fd); +#endif } } /** @@ -2680,16 +3019,20 @@ */ int client_cmd(std::vector<string> &words) { + int first_server = 1; + int server_nodes = 1; + std::string servers; + std::string experiment; + + buf_bpages = 1000; client_iovec = false; client_max = 1; client_ports = 1; - first_port = 4000; - first_server = 1; + first_port = -1; inet_family = AF_INET; 
net_gbps = 0.0; port_receivers = 1; protocol = "homa"; - server_nodes = 1; tcp_trunc = true; one_way = false; unloaded = 0; @@ -2697,11 +3040,23 @@ for (unsigned i = 1; i < words.size(); i++) { const char *option = words[i].c_str(); - if (strcmp(option, "--client-max") == 0) { + if (strcmp(option, "--buf-bpages") == 0) { + if (!parse(words, i+1, &buf_bpages, option, "integer")) + return 0; + i++; + } else if (strcmp(option, "--client-max") == 0) { if (!parse(words, i+1, (int *) &client_max, option, "integer")) return 0; i++; + } else if (strcmp(option, "--exp") == 0) { + if ((i + 1) >= words.size()) { + printf("No value provided for %s\n", + option); + return 0; + } + experiment = words[i+1]; + i++; } else if (strcmp(option, "--first-port") == 0) { if (!parse(words, i+1, &first_port, option, "integer")) return 0; @@ -2715,7 +3070,7 @@ return 0; i++; } else if (strcmp(option, "--id") == 0) { - if (!parse(words, i+1, &id, option, "integer")) + if (!parse(words, i+1, &node_id, option, "integer")) return 0; i++; } else if (strcmp(option, "--iovec") == 0) { @@ -2752,6 +3107,13 @@ if (!parse(words, i+1, &server_ports, option, "integer")) return 0; i++; + } else if (strcmp(option, "--servers") == 0) { + if ((i + 1) >= words.size()) { + printf("No value provided for %s\n", option); + return 0; + } + servers = words[i+1]; + i++; } else if (strcmp(option, "--unloaded") == 0) { if (!parse(words, i+1, &unloaded, option, "integer")) return 0; @@ -2770,17 +3132,52 @@ return 0; } } - init_server_addrs(); + if (experiment.empty()) { + experiment = protocol; + experiment += "_"; + experiment += workload; + } + + /* Figure out which nodes to use for servers (--servers, + * --server-nodes, --first-server). + */ + server_ids.clear(); + if (!servers.empty()) { + std::vector<std::string> ids; + + split(servers.c_str(), ',', ids); + for (std::string &id_string: ids) { + char *end; + int id = strtoul(id_string.c_str(), &end, 10); + if (*end != 0) { + printf("Bad server id '%s' in --servers " + "option '%s'\n", + id_string.c_str(), + servers.c_str()); + return 0; + } + server_ids.push_back(id); + } + } else { + for (int i = 0; i < server_nodes; i++) + server_ids.push_back(first_server + i); + } + client_port_max = client_max/client_ports; if (client_port_max < 1) client_port_max = 1; /* Create clients. 
*/ for (int i = 0; i < client_ports; i++) { - if (strcmp(protocol, "homa") == 0) - clients.push_back(new homa_client(i)); - else - clients.push_back(new tcp_client(i)); + if (strcmp(protocol, "homa") == 0) { + if (first_port == -1) + first_port = 4000; + clients.push_back(new homa_client(i, experiment)); + } else { + if (first_port == -1) + first_port = 5000; + clients.push_back(new tcp_client(i, experiment)); + } } last_stats_time = 0; time_trace::cleanup(); @@ -2822,9 +3219,12 @@ int dump_times_cmd(std::vector<string> &words) FILE *f; time_t now; char time_buffer[100]; + std::string exp; - if (words.size() != 2) { - printf("Wrong # args; must be 'dump_times file'\n"); + if (words.size() == 3) + exp = words[2]; + else if (words.size() != 2) { + printf("Wrong # args; must be 'dump_times file [experiment]'\n"); return 0; } f = fopen(words[1].c_str(), "w"); @@ -2837,16 +3237,19 @@ time(&now); strftime(time_buffer, sizeof(time_buffer), "%Y-%m-%d %H:%M:%S", localtime(&now)); - fprintf(f, "# Round-trip times measured by cp_node at %s\n", - time_buffer); + fprintf(f, "# Round-trip times measured by cp_node at %s for " + "experiment %s\n", + time_buffer, exp.empty() ? "" : exp.c_str()); fprintf(f, "# --protocol %s, --workload %s, --gpbs %.1f --threads %d,\n", protocol, workload, net_gbps, client_ports); - fprintf(f, "# --server-nodes %d --server-ports %d, --client-max %d\n", - server_nodes, server_ports, client_max); + fprintf(f, "# --server-nodes %lu --server-ports %d, --client-max %d\n", - server_ids.size(), server_ports, client_max); + server_ids.size(), server_ports, client_max); fprintf(f, "# Length RTT (usec)\n"); for (client *client: clients) { - __u32 start = client->total_responses % NUM_CLIENT_STATS; - __u32 i = start; + if (!exp.empty() && (client->experiment != exp)) + continue; + uint32_t start = client->total_responses % NUM_CLIENT_STATS; + uint32_t i = start; while (1) { if (client->actual_rtts[i] != 0) { fprintf(f, "%8d %12.2f\n", @@ -2977,7 +3380,9 @@ int log_cmd(std::vector<string> &words) */ int server_cmd(std::vector<string> &words) { - first_port = 4000; + std::string experiment; + buf_bpages = 1000; + first_port = -1; inet_family = AF_INET; protocol = "homa"; port_threads = 1; @@ -2988,7 +3393,19 @@ for (unsigned i = 1; i < words.size(); i++) { const char *option = words[i].c_str(); - if (strcmp(option, "--first-port") == 0) { + if (strcmp(option, "--buf-bpages") == 0) { + if (!parse(words, i+1, &buf_bpages, option, "integer")) + return 0; + i++; + } else if (strcmp(option, "--exp") == 0) { + if ((i + 1) >= words.size()) { + printf("No value provided for %s\n", + option); + return 0; + } + experiment = words[i+1]; + i++; + } else if (strcmp(option, "--first-port") == 0) { if (!parse(words, i+1, &first_port, option, "integer")) return 0; i++; @@ -3022,21 +3439,30 @@ return 0; } } + if (experiment.empty()) { + experiment = protocol; + experiment += "_"; + experiment += workload; + } if (strcmp(protocol, "homa") == 0) { + if (first_port == -1) + first_port = 4000; for (int i = 0; i < server_ports; i++) { homa_server *server = new homa_server(first_port + i, - i, inet_family, port_threads); + i, inet_family, port_threads, + experiment); homa_servers.push_back(server); } } else { + if (first_port == -1) + first_port = 5000; for (int i = 0; i < server_ports; i++) { tcp_server *server = new tcp_server(first_port + i, - i, port_threads); + i, port_threads, experiment); tcp_servers.push_back(server); } } - 
last_per_server_rpcs.resize(server_ports*port_threads, 0); last_stats_time = 0; return 1; } @@ -3059,6 +3485,7 @@ int stop_cmd(std::vector<string> &words) for (client *client: clients) client->stop_sender(); } else if (strcmp(option, "servers") == 0) { + log(NORMAL, "stop command deleting servers\n"); for (homa_server *server: homa_servers) delete server; homa_servers.clear(); @@ -3197,9 +3624,11 @@ void error_handler(int signal, siginfo_t* info, void* ucontext) void* caller_address = reinterpret_cast<void *>( uc->uc_mcontext.gregs[REG_RIP]); - log(NORMAL, "Signal %d (%s) at address %p from %p\n", + log(NORMAL, "ERROR: Signal %d (%s) at address %p from %p\n", signal, strsignal(signal), info->si_addr, caller_address); + tt("ERROR: Signal %d; freezing timetrace", signal); + time_trace::freeze(); const int max_frames = 128; void* return_addresses[max_frames]; @@ -3222,6 +3651,10 @@ log(NORMAL, "Backtrace:\n"); for (int i = 1; i < frames; ++i) log(NORMAL, "%s\n", symbols[i]); + log(NORMAL, "Writing time trace to error.tt\n"); + if (time_trace::print_to_file("error.tt")) + log(NORMAL, "ERROR: couldn't write time trace %s\n", + strerror(errno)); fflush(log_file); while(1) {} @@ -3238,13 +3671,13 @@ int main(int argc, char** argv) if (getrlimit(RLIMIT_NOFILE, &limits) != 0) { log(NORMAL, "FATAL: couldn't read file descriptor limits: " "%s\n", strerror(errno)); - exit(1); + fatal(); } limits.rlim_cur = limits.rlim_max; if (setrlimit(RLIMIT_NOFILE, &limits) != 0) { log(NORMAL, "FATAL: couldn't increase file descriptor limit: " "%s\n", strerror(errno)); - exit(1); + fatal(); } struct sigaction action; action.sa_sigaction = error_handler; @@ -3263,7 +3696,7 @@ for (int i = 1; i < argc; i++) words.emplace_back(argv[i]); if (!exec_words(words)) - exit(1); + fatal(); /* Instead of going interactive, just print stats. * every second. diff --git a/util/cp_server_ports b/util/cp_server_ports index bc03178d..dfc3842d 100755 --- a/util/cp_server_ports +++ b/util/cp_server_ports @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures the throughput of a single server as a # function of the number of receiving ports diff --git a/util/cp_tcp b/util/cp_tcp index f6fe40e1..35a59f43 100755 --- a/util/cp_tcp +++ b/util/cp_tcp @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. 
-# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures the performance of TCP by itself, with # no message truncation. @@ -92,7 +81,7 @@ for workload, bw in load_info: log("Generating slowdown plot for %s" % (workload)) title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), options.num_nodes, bw) - ax = start_slowdown_plot(title, 1000, exp) + ax = start_plot_vs_msg_length(title, 1000, exp) plot_slowdown(ax, exp, "p99", "%s P99" % (prot)) plot_slowdown(ax, exp, "p50", "%s P50" % (prot)) ax.legend() diff --git a/util/cp_tcp_config b/util/cp_tcp_config index 4dfba431..93db1517 100755 --- a/util/cp_tcp_config +++ b/util/cp_tcp_config @@ -1,21 +1,10 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2026 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures TCP and DCTCP while varying one or more -# aspects of Homa's configuration (such as duty cycle). +# aspects of Homa's configuration (such as number of server threads). # Type "cp_tcp_config --help" for documentation. 
from cperf import * @@ -23,92 +12,97 @@ from cperf import * load_info = [["w2", 3.2], ["w3", 14], ["w4", 20], ["w5", 20]] parser = get_parser(description= - 'Measures Homa slowdown as the number of available priority levels ' - 'varies.', + 'Measures TCP performance as a function of various system ' + 'configuration parameters.', usage='%(prog)s [options]') parser.add_argument('-c', '--config', dest='config', - choices=['cports', 'sports', 'threads'], + choices=['cports', 'max_nic_queue', 'nic_backlog', 'sports', + 'threads'], required = True, help='Aspect of configuration to change') -parser.add_argument('--tcp', dest='tcp', type=boolean, - default=True, help="Boolean value: indicates whether measurements " - "should be run on TCP (default: true)") parser.add_argument('--dctcp', dest='dctcp', type=boolean, default=False, help="Boolean value:: indicates whether measurements " "should be run on DCTCP (default: false)") options = parser.parse_args() init(options) -servers = range(0, options.num_nodes) -clients = range(0, options.num_nodes) if options.workload != "": load_info = [[options.workload, options.gbps]] specs = [] -if options.config == 'threads': - for client, server in [[3, 6], [4, 8], [5, 10], [6, 12], [7, 14]]: - o = copy.deepcopy(options) - o.tcp_server_ports = server - o.tcp_client_ports = client - name = "s%dc%d" % (server, client) - specs.append({'options': o, 'exp_name': name, 'label': name}) -elif options.config == 'cports': +if options.config == 'cports': for ports in [2, 3, 4, 6, 8]: - o = copy.deepcopy(options) - o.tcp_client_ports = ports - specs.append({'options': o, - 'exp_name': "cports%d" % (ports), - 'label': "%d client ports" % (ports)}) + specs.append({'exp_name': "cports%d" % (ports), + 'label': "%d client ports" % (ports), + 'options': ['tcp_client_ports', ports]}) +elif options.config == 'max_nic_queue': + # Vary the limit on length of any individual NIC queue + for usecs in [10, 20, 40, 80]: + specs.append({'exp_name': 'nicq_%d' % (usecs), + 'label': 'max_nic_queue_usecs %d' % (usecs), + 'sysctl': ['.net.homa.max_nic_queue_usecs', usecs]}) +elif options.config == 'nic_backlog': + for usec in [5, 10, 20, 40]: + specs.append({'exp_name': "nicq%d" % (usec), + 'label': r'NIC queue max %d µsec' % (usec), + 'sysctl': ['.net.homa.max_nic_est_backlog_usecs', usec]}) elif options.config == 'sports': for ports in [6, 9, 12, 15, 18]: - o = copy.deepcopy(options) - o.tcp_server_ports = ports - specs.append({'options': o, - 'exp_name': "sports%d" % (ports), - 'label': "%d server ports" % (ports)}) + specs.append({ 'exp_name': "sports%d" % (ports), + 'label': "%d server ports" % (ports), + 'options': ['tcp_server_ports', ports]}) +elif options.config == 'threads': + for client, server in [[3, 6], [4, 8], [5, 10], [6, 12], [7, 14]]: + name = "s%dc%d" % (server, client) + specs.append({'exp_name': name, + 'label': name, + 'options': ['tcp_server_ports', server, + 'tcp_client_ports', client]}) +# sysctl parameter name -> old value to restore. +old_values = {} if not options.plot_only: - congestion = get_sysctl_parameter("net.ipv4.tcp_congestion_control") + congestion = get_sysctl_parameter("net.ipv4.tcp_congestion_control", + options.nodes[0]) try: # For each workload, run a set of experiments with different # configurations. 
for workload, bw in load_info: - o = copy.deepcopy(options) - o.protocol = "homa" - o.workload = workload - o.client_ports = 1 - o.client_max = 1 - o.server_ports = 1 - o.server_nodes = 1 - o.first_server = 1 - o.unloaded = 500 - start_servers(range(1, 2), o) - run_experiment("unloaded_" + workload, range(0, 1), o) - for spec in specs: - o = options - if 'options' in spec: - o = spec['options'] + o = copy.deepcopy(options) o.protocol = "tcp" o.workload = workload o.gbps = bw/2.0 - start_servers(servers, o) - if options.tcp: - set_sysctl_parameter("net.ipv4.tcp_congestion_control", - "cubic", range(0, options.num_nodes)) - run_experiment("tcp_%s_%s" % (spec['exp_name'], workload), - clients, o) + if 'sysctl' in spec: + for i in range(0, len(spec['sysctl']), 2): + name = spec['sysctl'][i] + value = spec['sysctl'][i+1] + if name not in old_values: + old_values[name] = get_sysctl_parameter(name, + options.nodes[0]) + log("Setting %s = %s" % (name, value)) + set_sysctl_parameter(name, value, options.nodes) + if 'options' in spec: + for i in range(0, len(spec['options']), 2): + name = spec['options'][i] + value = spec['options'][i+1] + setattr(o, name, value) + exp_name = "%s_%s" % (spec['exp_name'], workload) + start_servers(exp_name, o.servers, o) if options.dctcp: set_sysctl_parameter("net.ipv4.tcp_congestion_control", - "dctcp", range(0, options.num_nodes)) - run_experiment("dctcp_%s_%s" % (spec['exp_name'], workload), - clients, o) + "dctcp", options.nodes) + run_experiment("%s_%s" % (spec['exp_name'], workload), + o.clients, o) except Exception as e: log(traceback.format_exc()) print("Resetting TCP congestion control to %s" % (congestion)) set_sysctl_parameter("net.ipv4.tcp_congestion_control", congestion, - range(0, options.num_nodes)) + options.nodes) + for name, value in old_values.items(): + print("Restoring %s to %s" % (name, value)) + set_sysctl_parameter(name, value, options.nodes) log("Stopping nodes") stop_nodes() @@ -116,66 +110,49 @@ if not options.plot_only: # Generate plots and reports for workload, bw in load_info: - set_unloaded("unloaded_" + workload) - log("Generating slowdown plots for %s" % (workload)) - if options.tcp: - title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), - options.num_nodes, bw) - ax = start_slowdown_plot(title, 1000, - "tcp_%s_%s" % (specs[0]['exp_name'], workload), - y_label="TCP Slowdown") - for spec in specs: - exp_name = "tcp_%s_%s" % (spec['exp_name'], workload) - plot_slowdown(ax, exp_name, "p99", spec['label']+' P99') - for spec in specs: - exp_name = "tcp_%s_%s" % (spec['exp_name'], workload) - plot_slowdown(ax, exp_name, "p50", spec['label']+' P50') - ax.legend(loc="upper right", prop={'size': 9}) - plt.tight_layout() - plt.savefig("%s/reports/tcp_%s_%s.pdf" % - (options.log_dir, options.config, workload)) - - if options.dctcp: - title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), - options.num_nodes, bw) - ax = start_slowdown_plot(title, 10000, - "dctcp_%s_%s" % (specs[0]['exp_name'], workload), - y_label="DCTCP Slowdown") - for spec in specs: - exp_name = "dctcp_%s_%s" % (spec['exp_name'], workload) - plot_slowdown(ax, exp_name, "p99", spec['label']+' P99') - for spec in specs: - exp_name = "dctcp_%s_%s" % (spec['exp_name'], workload) - plot_slowdown(ax, exp_name, "p50", spec['label']+' P50') - ax.legend(loc="upper right", prop={'size': 9}) - plt.tight_layout() - plt.savefig("%s/reports/dctcp_%s_%s.pdf" % - (options.log_dir, options.config, workload)) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + 
options.num_nodes, get_node_type(), bw) + ax = start_plot_vs_msg_length(title, 1000, + "%s_%s" % (specs[0]['exp_name'], workload), + y_label="Slowdown") + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_slowdown(ax, exp_name, "p99", spec['label']+' P99') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_slowdown(ax, exp_name, "p50", spec['label']+' P50') + ax.legend(loc="upper right", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/reports/%s_%s.pdf" % + (options.log_dir, options.config, workload)) + + log("Generating latency plots for %s" % (workload)) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, get_node_type(), bw) + ax = start_plot_vs_msg_length(title, [10, 10000], "%s_%s" % ( + specs[0]['exp_name'], workload), y_label=r'RTT (µsec)') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_histogram(ax, exp_name, "p99", spec['label'] + ' P99') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_histogram(ax, exp_name, "p50", spec['label'] + ' P50') + ax.legend(loc="lower right", prop={'size': 8}) + plt.tight_layout() + plt.savefig("%s/reports/%s_%s_rtt.pdf" % + (options.log_dir, options.config, workload)) log("Generating short message CDFs for %s" % (workload)) - if options.tcp: - title = "%s %d nodes" % (workload.capitalize(), options.num_nodes) - start_cdf_plot(title, 10, 0.99e05, 1e-05, "TCP RTT (usecs)", - "Cumulative Fraction of Short Messages") - for spec in specs: - exp_name = "tcp_%s_%s" % (spec['exp_name'], workload) - x, y = get_short_cdf(exp_name) - plt.plot(x, y, label=spec['label']) - - plt.legend(loc="upper right", prop={'size': 9}) - plt.savefig("%s/reports/tcp_%s_%s_cdfs.pdf" % - (options.log_dir, options.config, workload)) - - if options.dctcp: - title = "%s %d nodes" % (workload.capitalize(), options.num_nodes) - start_cdf_plot(title, 10, 0.99e05, 1e-05, "DCTCP RTT (usecs)", - "Cumulative Fraction of Short Messages") - for spec in specs: - exp_name = "dctcp_%s_%s" % (spec['exp_name'], workload) - x, y = get_short_cdf(exp_name) - plt.plot(x, y, label=spec['label']) - - plt.legend(loc="upper right", prop={'size': 9}) - plt.savefig("%s/reports/dctcp_%s_%s_cdfs.pdf" % - (options.log_dir, options.config, workload)) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, get_node_type(), bw) + start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)", + "Cumulative Fraction of Short Messages") + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + x, y = get_short_cdf(exp_name) + plt.plot(x, y, label=spec['label']) + + plt.legend(loc="upper right", prop={'size': 9}) + plt.savefig("%s/reports/%s_%s_cdfs.pdf" % + (options.log_dir, options.config, workload)) diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index 76ef5bf9..e6d29bd0 100755 --- a/util/cp_vs_tcp +++ b/util/cp_vs_tcp @@ -1,25 +1,15 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark compares the performance of Homa with TCP. # Type "cp_vs_tcp --help" for documentation. from cperf import * -load_info = [["w2", 3.2], ["w3", 14], ["w4", 20], ["w5", 20]] +# Workloads, bandwidths, and running times to use by default. +load_info = [["w2", 3.2, 5], ["w3", 14, 10], ["w4", 20, 20], ["w5", 20, 30]] parser = get_parser(description= 'Measures slowdown as a function of message size for Homa and TCP.', @@ -37,94 +27,78 @@ parser.add_argument('--servers', dest='num_servers', type=int, metavar='count', "server (default: 0)") options = parser.parse_args() init(options) -if options.num_servers <= 0: - servers = range(0, options.num_nodes) - clients = range(0, options.num_nodes) - options.num_clients = options.num_nodes - options.num_servers = options.num_nodes - bw_multiplier = 0.5 -else: + +bw_multiplier = 0.5 +if options.num_servers > 0: if options.num_servers >= options.num_nodes: raise Error("Illegal value %d for --servers option; must be less " - "than --nodes (%d)" % (options.num_servers, options.num_nodes)) - options.num_clients = options.num_nodes - options.num_servers - servers = range(0, options.num_servers) - clients = range(options.num_servers, options.num_nodes) + "than --nodes (%d)" % (options.num_servers, + options.num_nodes)) + options.servers = options.nodes[0:options.num_servers] + options.clients = options.nodes[options.num_servers:len(options.nodes)] options.server_ports = options.server_ports * 2 options.client_ports = options.client_ports * 2 options.tcp_server_ports = options.tcp_server_ports * 2 options.tcp_client_ports = options.tcp_client_ports * 2 - bw_multiplier = min(options.num_servers, options.num_clients) \ - / options.num_clients + bw_multiplier = min(len(options.servers), len(options.clients)) \ + / len(options.clients) if options.workload != "": - load_info = [[options.workload, options.gbps]] + load_info = [[options.workload, options.gbps, options.seconds]] # First, run all of the experiments if not options.plot_only: - congestion = get_sysctl_parameter("net.ipv4.tcp_congestion_control") - for workload, bw in load_info: + congestion = get_sysctl_parameter("net.ipv4.tcp_congestion_control", + options.nodes[0]) + for workload, bw, seconds in load_info: options.workload = workload options.gbps = bw * bw_multiplier - unloaded_exp = "unloaded_" + workload + options.seconds = seconds homa_exp = "homa_" + workload tcp_exp = "tcp_" + workload dctcp_exp = "dctcp_" + workload try: options.protocol = "homa" - start_servers(range(1, 2), options) - - o = copy.deepcopy(options) - o.gbps = 0.0 - o.client_ports = 1 - o.client_max = 1 - o.server_ports = 1 - o.server_nodes = 1 - o.first_server = 1 - o.unloaded = 500 - run_experiment(unloaded_exp, range(0, 1), o) - start_servers(servers, options) - run_experiment(homa_exp, clients, options) + start_servers(homa_exp, options.servers, options) + run_experiment(homa_exp, options.clients, options) if options.tcp: options.protocol = "tcp" set_sysctl_parameter("net.ipv4.tcp_congestion_control", - "cubic", range(0, options.num_nodes)) - start_servers(servers, options) - 
run_experiment(tcp_exp, clients, options) + "cubic", options.nodes) + start_servers(tcp_exp, options.servers, options) + run_experiment(tcp_exp, options.clients, options) if options.dctcp: options.protocol = "tcp" set_sysctl_parameter("net.ipv4.tcp_congestion_control", - "dctcp", range(0, options.num_nodes)) - start_servers(servers, options) - run_experiment(dctcp_exp, clients, options) + "dctcp", options.nodes) + start_servers(tcp_exp, options.servers, options) + run_experiment(dctcp_exp, options.clients, options) except Exception as e: log(traceback.format_exc()) if options.tcp or options.dctcp: print("Resetting TCP congestion control to %s" % (congestion)) set_sysctl_parameter("net.ipv4.tcp_congestion_control", congestion, - range(0, options.num_nodes)) + options.nodes) log("Stopping nodes") stop_nodes() scan_logs() # Generate plots and reports -for workload, bw in load_info: - unloaded_exp = "unloaded_" + workload +for workload, bw, seconds in load_info: homa_exp = "homa_" + workload tcp_exp = "tcp_" + workload dctcp_exp = "dctcp_" + workload - - set_unloaded(unloaded_exp) + scan_metrics(homa_exp) # Generate slowdown plot. log("Generating slowdown plot for %s" % (workload)) - title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), - options.num_nodes, bw) - ax = start_slowdown_plot(title, 1000, homa_exp) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, get_node_type(), bw) + ax = start_plot_vs_msg_length(title, 1000, homa_exp) if options.tcp: plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) plot_slowdown(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) @@ -135,11 +109,28 @@ for workload, bw in load_info: plot_slowdown(ax, homa_exp, "p50", "Homa P50", color=homa_color2) ax.legend(loc="upper right", prop={'size': 9}) plt.tight_layout() - plt.savefig("%s/reports/vs_tcp_%s.pdf" % (options.log_dir, workload)) + plt.savefig("%s/reports/slowdown_%s.pdf" % (options.log_dir, workload)) + + # Generate latency plot. + log("Generating RTT latency plot for %s" % (workload)) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, get_node_type(), bw) + ax = start_plot_vs_msg_length(title, [10, 10000], homa_exp, + y_label=r'RTT (µsec)') + if options.tcp: + plot_histogram(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) + plot_histogram(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) + if options.dctcp: + plot_histogram(ax, dctcp_exp, "p99", "DCTCP P99", color=dctcp_color) + plot_histogram(ax, dctcp_exp, "p50", "DCTCP P50", color=dctcp_color2) + plot_histogram(ax, homa_exp, "p99", "Homa P99", color=homa_color) + plot_histogram(ax, homa_exp, "p50", "Homa P50", color=homa_color2) + ax.legend(loc="upper left", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/reports/rtt_%s.pdf" % (options.log_dir, workload)) # Generate CDF of small-message RTTs. 
log("Generating short message CDF for %s" % (workload)) - unloaded_x, unloaded_y = get_short_cdf(unloaded_exp) homa_x, homa_y = get_short_cdf(homa_exp) if options.tcp: tcp_x, tcp_y = get_short_cdf(tcp_exp) @@ -152,6 +143,5 @@ for workload, bw in load_info: if options.dctcp: plt.plot(dctcp_x, dctcp_y, label="DCTCP", color=dctcp_color) plt.plot(homa_x, homa_y, label="Homa", color=homa_color) - plt.plot(unloaded_x, unloaded_y, label="Homa best case", color=unloaded_color) plt.legend(loc="upper right", prop={'size': 9}) plt.savefig("%s/reports/short_cdf_%s.pdf" % (options.log_dir, workload)) diff --git a/util/cperf.py b/util/cperf.py index a1a095b9..551c12f2 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -1,23 +1,13 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Stanford University -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2020-2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This file contains library functions used to run cluster performance # tests for the Linux kernel implementation of Homa. import argparse +from collections import defaultdict import copy import datetime import glob @@ -26,6 +16,7 @@ import matplotlib.pyplot as plt import numpy as np import os +from pathlib import Path import platform import re import shutil @@ -49,7 +40,7 @@ # If a server's id appears as a key in this dictionary, it means we # have started homa_prio running on that node. The value of each entry is # a Popen object for the homa_prio instance; if this is terminated, then -# the homa_prio process will end +# the homa_prio process will end. homa_prios = {} # The range of nodes currently running cp_node servers. @@ -64,30 +55,81 @@ # Indicates whether we should generate additional log messages for debugging verbose = False +# The --delete-rtts command-line option. +delete_rtts = False + +# The CloudLab node type for this node (e.g. xl170) +node_type = None + +# Value of the "--stripped" option. +stripped = False + +# Speed of host uplinks. +link_mbps = None + +# "Best possible RTT for short messages", depending on CloudLab node type. +# Used to compute slowdowns. +baseline_rtts = {"xl170": 15, "c6620": 25, "c6525-25g": 25, "c6525-100g": 25, + "default": 25} + # Defaults for command-line options; assumes that servers and clients -# share nodes. +# share nodes. Individual benchmarks may override some of these values. +# 'None' values will eventually be replaced with values from defaults_25g +# or defaults_100g. default_defaults = { 'gbps': 0.0, # Note: very large numbers for client_max hurt Homa throughput with # unlimited load (throttle queue inserts take a long time). 
'client_max': 200, - 'client_ports': 3, + 'client_ports': None, 'log_dir': 'logs/' + time.strftime('%Y%m%d%H%M%S'), 'mtu': 0, 'no_trunc': '', 'protocol': 'homa', + 'port_receivers': None, + 'port_threads': None, + 'seconds': 30, + 'server_ports': None, + 'tcp_client_ports': None, + 'tcp_port_receivers': None, + 'tcp_server_ports': None, + 'tcp_port_threads': None, + 'unsched': 0, + 'unsched_boost': 0.0, + 'workload': '' +} + +# These defaults are used for 25 Gbps networks. +defaults_25g = { + 'client_ports': 3, 'port_receivers': 3, 'port_threads': 3, - 'seconds': 5, 'server_ports': 3, 'tcp_client_ports': 4, 'tcp_port_receivers': 1, + 'tcp_port_threads': 1, 'tcp_server_ports': 8, +} + +# These defaults are used for 100 Gbps networks. +defaults_100g = { + 'client_ports': 5, + 'port_receivers': 3, + 'port_threads': 3, + 'server_ports': 2, + 'tcp_client_ports': 10, + 'tcp_port_receivers': 1, 'tcp_port_threads': 1, - 'unloaded': 0, - 'unsched': 0, - 'unsched_boost': 0.0, - 'workload': '' + 'tcp_server_ports': 20, +} + +# Maps from CloudLab node type ('xl170', 'c6620', etc.) to defaults +# appropriate for that cluster type. +type_defaults = { + 'xl170': defaults_25g, + 'c6620': defaults_100g, + 'c6525-25g': defaults_25g, + 'c6525-100g': defaults_100g } # Keys are experiment names, and each value is the digested data for that @@ -106,12 +148,9 @@ # slow_50: List of 50th percentile slowdowns corresponding to each length # slow_99: List of 99th percentile slowdowns corresponding to each length # slow_999: List of 999th percentile slowdowns corresponding to each length +# avg_slowdown: Average slowdown across all messages of all sizes digests = {} -# A dictionary where keys are message lengths, and each value is the median -# unloaded RTT (usecs) for messages of that length. -unloaded_p50 = {} - # Keys are filenames, and each value is a dictionary containing data read # from that file. Within that dictionary, each key is the name of a column # within the file, and the value is a list of numbers read from the given @@ -160,6 +199,7 @@ def log(message): """ global log_file print(message) + log_file.write("%.9f " % (time.time())) log_file.write(message) log_file.write("\n") @@ -173,6 +213,7 @@ def vlog(message): global log_file, verbose if verbose: print(message) + log_file.write("%.9f " % (time.time())) log_file.write(message) log_file.write("\n") @@ -188,6 +229,26 @@ def get_parser(description, usage, defaults = {}): are defaults; used to modify the defaults for some of the options (there is a default default for each option). """ + + # Configure defaults for this particular node type (e.g. 
network speed) + p = Path("/var/emulab/boot/nodetype") + if p.is_file(): + type = p.read_text().strip() + if type in type_defaults: + node_defaults = type_defaults[type] + else: + print("Couldn't find option defaults for node type '%s'; " + "using 100 Gbps defaults" % (type), file=sys.stderr) + node_defaults = defaults_100g + else: + print("Couldn't read node type from /var/emulab/boot/nodetype; " + "using 100 Gbps defaults") + node_defaults = defaults_100g + for key, value in node_defaults.items(): + # Only set default if the application hasn't already specified a value + if default_defaults[key] == None: + default_defaults[key] = value + for key in default_defaults: if not key in defaults: defaults[key] = default_defaults[key] @@ -202,8 +263,8 @@ def get_parser(description, usage, defaults = {}): '(default: %.2f)' % (defaults['gbps'])) parser.add_argument('--client-max', type=int, dest='client_max', metavar='count', default=defaults['client_max'], - help='Maximum number of requests each client machine can have ' - 'outstanding at a time (divided evenly among its ports) ' + help='Maximum number of Homa requests each client machine can have ' + 'outstanding at a time (divided evenly among the Homa ports) ' '(default: %d)' % (defaults['client_max'])) parser.add_argument('--client-ports', type=int, dest='client_ports', metavar='count', default=defaults['client_ports'], @@ -214,6 +275,8 @@ def get_parser(description, usage, defaults = {}): help='Name to use for the cperf log file (default: cperf.log)') parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Pause after starting servers to enable debugging setup') + parser.add_argument('--delete-rtts', dest='delete_rtts', action='store_true', + help='Delete .rtt files after reading, in order to save disk space') parser.add_argument('-h', '--help', action='help', help='Show this help message and exit') parser.add_argument('-6', '--ipv6', dest='ipv6', action='store_const', @@ -254,6 +317,24 @@ def get_parser(description, usage, defaults = {}): metavar='count', default=defaults['server_ports'], help='Number of ports on which each server should listen ' '(default: %d)'% (defaults['server_ports'])) + parser.add_argument('--set-ids', dest='set_ids', type=boolean, + default=True, metavar="T/F", help="Boolean value: if true, the " + "next_id sysctl parameter will be set on each node in order to " + "avoid conflicting RPC ids on different nodes (default: true)") + parser.add_argument('--skip', dest='skip', + metavar='nodes', + help='List of node numbers not to use in the experiment; can ' + ' contain ranges, such as "3,5-8,12"') + parser.add_argument('--stripped', dest='stripped', type=boolean, + default=False, metavar="T/F", help='Boolean value: true means ' + 'Homa has been stripped for upstreaming, which means some ' + 'facilities are not available (default: false)') + parser.add_argument('--tcp-client-max', dest='tcp_client_max', type=int, + metavar='count', default=0, help="Maximum number of TCP requests " + "that can be outstanding from a client node at once (divided evenly " + "among the TCP ports); if zero, the " + "--client-max option is used for TCP as well (i.e. 
each protocol " + "can have that many outstanding requests) (default: 0)") parser.add_argument('--tcp-client-ports', type=int, dest='tcp_client_ports', metavar='count', default=defaults['tcp_client_ports'], help='Number of ports on which each TCP client should issue requests ' @@ -266,11 +347,15 @@ def get_parser(description, usage, defaults = {}): parser.add_argument('--tcp-port-threads', type=int, dest='tcp_port_threads', metavar='count', default=defaults['tcp_port_threads'], help='Number of threads listening on each TCP server port ' - '(default: %d)'% (defaults['port_threads'])) + '(default: %d)'% (defaults['tcp_port_threads'])) parser.add_argument('--tcp-server-ports', type=int, dest='tcp_server_ports', metavar='count', default=defaults['tcp_server_ports'], help='Number of ports on which TCP servers should listen ' '(default: %d)'% (defaults['tcp_server_ports'])) + parser.add_argument('--tt-freeze', dest='tt_freeze', type=boolean, + default=True, metavar="T/F", help="Boolean value: if true, " + "timetraces will be frozen on all nodes at the end of the " + "Homa benchmark run (default: true)") parser.add_argument('--unsched', type=int, dest='unsched', metavar='count', default=defaults['unsched'], help='If nonzero, homa_prio will always use this number of ' @@ -294,7 +379,8 @@ def init(options): """ Initialize various global state, such as the log file. """ - global log_dir, log_file, verbose + global log_dir, log_file, verbose, delete_rtts, link_mbps + global stripped log_dir = options.log_dir if not options.plot_only: if os.path.exists(log_dir): @@ -306,6 +392,29 @@ def init(options): vlog("cperf starting at %s" % (date_time)) s = "" + # Figure out which nodes to use for the experiment + skips = {} + if options.skip: + for spec in options.skip.split(","): + nodes = spec.split("-") + if len(nodes) == 1: + skips[int(spec)] = 1 + elif len(nodes) == 2: + for i in range(int(nodes[0]), int(nodes[1])+1): + skips[i] = 1 + else: + raise Exception("Bad skip range '%s': must be either id " + "or id1-id2" % (spec)) + nodes = [] + id = 0 + while len(nodes) != options.num_nodes: + if not id in skips: + nodes.append(id) + id += 1 + options.nodes = nodes + options.servers = options.nodes + options.clients = options.nodes + # Log configuration information, including options here as well # as Homa's configuration parameters. opts = vars(options) @@ -314,20 +423,30 @@ def init(options): s += ", " s += ("--%s: %s" % (name, str(opts[name]))) vlog("Options: %s" % (s)) - vlog("Homa configuration:") - for param in ['dead_buffs_limit', 'duty_cycle', 'grant_fifo_fraction', - 'grant_increment', 'gro_policy', 'link_mbps', 'max_dead_buffs', - 'max_gro_skbs', 'max_gso_size', 'max_nic_queue_ns', - 'max_overcommit', 'num_priorities', 'pacer_fifo_fraction', - 'poll_usecs', 'reap_limit', 'resend_interval', 'resend_ticks', - 'rtt_bytes', 'throttle_min_bytes', 'timeout_resends']: - result = subprocess.run(['sysctl', '-n', '.net.homa.' 
+ param], - capture_output = True, encoding="utf-8") - vlog(" %-20s %s" % (param, result.stdout.rstrip())) + vlog("Homa configuration (node%d):" % (options.nodes[0])) + result = subprocess.run(['ssh', 'node%d' % (options.nodes[0]), + 'sysctl', '-a'], capture_output=True, encoding="iso8859-1") + if (result.returncode != 0): + log("sysctl -a on node%d exited with status %d:" % + (options.nodes[0], result.returncode)) + log(result.stderr.rstrip()) + for line in result.stdout.splitlines(): + match = re.match('.*net.homa.([^ ]+) = (.*)', line) + if match: + name = match.group(1) + value = match.group(2) + vlog(" %-20s %s" % (name, value)) + if name == 'link_mbps': + link_mbps = float(value) + if link_mbps == None: + link_mbps = 25000 if options.mtu != 0: log("Setting MTU to %d" % (options.mtu)) - do_ssh(["config", "mtu", str(options.mtu)], range(0, options.num_nodes)) + do_ssh(["config", "mtu", str(options.mtu)], options.nodes) + + if options.delete_rtts: + delete_rtts = True def wait_output(string, nodes, cmd, time_limit=10.0): """ @@ -335,6 +454,7 @@ def wait_output(string, nodes, cmd, time_limit=10.0): each of the nodes in the list given by nodes. If a long time goes by without the string appearing, an exception is thrown. string: The value to wait for + nodes: List of node ids from which output is expected cmd: Used in error messages to indicate the command that failed time_limit: An error will be generated if this much time goes by without the desired string appearing @@ -342,6 +462,7 @@ def wait_output(string, nodes, cmd, time_limit=10.0): global active_nodes outputs = [] printed = False + bad_node = -1 for id in nodes: while len(outputs) <= id: @@ -349,7 +470,8 @@ def wait_output(string, nodes, cmd, time_limit=10.0): start_time = time.time() while True: if time.time() > (start_time + time_limit): - raise Exception("timeout exceeded for command '%s'" % (cmd)) + raise Exception("timeout (%.1fs) exceeded for command '%s' on node%d" + % (time_limit, cmd, bad_node)) for id in nodes: data = active_nodes[id].stdout.read(1000) if data != None: @@ -357,7 +479,7 @@ def wait_output(string, nodes, cmd, time_limit=10.0): if print_data.endswith(string): print_data = print_data[:(len(data) - len(string))] if print_data != "": - log("output from node%d: '%s'" % (id, print_data)) + log("extra output from node%d: '%s'" % (id, print_data)) outputs[id] += data bad_node = -1 for id in nodes: @@ -376,43 +498,45 @@ def wait_output(string, nodes, cmd, time_limit=10.0): "expected '%s', got '%s'" % (bad_node, cmd, string, outputs[bad_node])) -def start_nodes(r, options): +def start_nodes(ids, options): """ - Start up cp_node on a group of nodes. + Start up cp_node on a group of nodes. Also starts homa_prio on the + nodes, if protocol is "homa". 
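+    Output from each homa_prio instance is written to homa_prio-<id>.log
+    in the log directory.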
- r: The range of nodes on which to start cp_node, if it isn't already + ids: List of node ids on which to start cp_node, if it isn't already running - options: Command-line options that may affect experiment + options: Command-line options that may affect node configuration """ - global active_nodes + global active_nodes, homa_prios, verbose started = [] - for id in r: - if id in active_nodes: - continue - vlog("Starting cp_node on node%d" % (id)) - node = subprocess.Popen(["ssh", "-o", "StrictHostKeyChecking=no", - "node%d" % (id), "cp_node"], encoding="utf-8", - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - fl = fcntl.fcntl(node.stdin, fcntl.F_GETFL) - fcntl.fcntl(node.stdin, fcntl.F_SETFL, fl | os.O_NONBLOCK) - fl = fcntl.fcntl(node.stdout, fcntl.F_GETFL) - fcntl.fcntl(node.stdout, fcntl.F_SETFL, fl | os.O_NONBLOCK) - active_nodes[id] = node - if not options.no_homa_prio: - f = open("%s/homa_prio-%d.log" % (log_dir,id), "w") - homa_prios[id] = subprocess.Popen(["ssh", "-o", - "StrictHostKeyChecking=no", "node%d" % (id), "sudo", - "bin/homa_prio", "--interval", "500", "--unsched", - str(options.unsched), "--unsched-boost", - str(options.unsched_boost)], encoding="utf-8", - stdout=f, stderr=subprocess.STDOUT) - f.close - started.append(id) + for id in ids: + if not id in active_nodes: + vlog("Starting cp_node on node%d" % (id)) + node = subprocess.Popen(["ssh", "-o", "StrictHostKeyChecking=no", + "node%d" % (id), "cp_node"], encoding="utf-8", + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + fl = fcntl.fcntl(node.stdin, fcntl.F_GETFL) + fcntl.fcntl(node.stdin, fcntl.F_SETFL, fl | os.O_NONBLOCK) + fl = fcntl.fcntl(node.stdout, fcntl.F_GETFL) + fcntl.fcntl(node.stdout, fcntl.F_SETFL, fl | os.O_NONBLOCK) + active_nodes[id] = node + started.append(id) + if options.protocol == "homa": + if options.set_ids: + set_sysctl_parameter(".net.homa.next_id", + str(100000000*(id+1)), [id]) + if not options.no_homa_prio: + f = open("%s/homa_prio-%d.log" % (log_dir,id), "w") + homa_prios[id] = subprocess.Popen(["ssh", "-o", + "StrictHostKeyChecking=no", "node%d" % (id), "sudo", + "bin/homa_prio", "--interval", "500", "--unsched", + str(options.unsched), "--unsched-boost", + str(options.unsched_boost)], encoding="utf-8", + stdout=f, stderr=subprocess.STDOUT) + f.close wait_output("% ", started, "ssh") - log_level = "normal" - if verbose: - log_level = "verbose" + log_level = "verbose" if verbose else "normal" command = "log --file node.log --level %s" % (log_level) for id in started: active_nodes[id].stdin.write(command + "\n") @@ -425,13 +549,13 @@ def stop_nodes(): """ global active_nodes, server_nodes for id, popen in homa_prios.items(): - subprocess.run(["ssh", "-o", "StrictHostKeyChecking=no", + do_subprocess(["ssh", "-o", "StrictHostKeyChecking=no", "node%d" % id, "sudo", "pkill", "homa_prio"]) try: popen.wait(5.0) except subprocess.TimeoutExpired: log("Timeout killing homa_prio on node%d" % (id)) - for node in active_nodes.values(): + for id, node in active_nodes.items(): node.stdin.write("exit\n") try: node.stdin.flush() @@ -440,27 +564,27 @@ def stop_nodes(): for node in active_nodes.values(): node.wait(5.0) for id in active_nodes: - subprocess.run(["rsync", "-rtvq", "node%d:node.log" % (id), + do_subprocess(["rsync", "-rtvq", "node%d:node.log" % (id), "%s/node%d.log" % (log_dir, id)]) active_nodes.clear() server_nodes = range(0,0) -def do_cmd(command, r, r2 = range(0,0)): +def do_cmd(command, ids, ids2 = []): """ Execute a cp_node 
command on a given group of nodes.

     command:  A command to execute on each node
-    r:        A group of node ids on which to run the command (range, list, etc.)
-    r2:       An optional additional group of node ids on which to run the
-              command; if a note is present in both r and r2, the
+    ids:      List of node ids on which to run the command
+    ids2:     An optional additional list of node ids on which to run the
+              command; if a node is present in both ids and ids2, the
               command will only be performed once
     """
     global active_nodes
     nodes = []
-    for id in r:
+    for id in ids:
         nodes.append(id)
-    for id in r2:
-        if id not in r:
+    for id in ids2:
+        if id not in ids:
             nodes.append(id)
     for id in nodes:
         vlog("Command for node%d: %s" % (id, command))
@@ -481,21 +605,20 @@
     """
     vlog("ssh command on nodes %s: %s" % (str(nodes), " ".join(command)))
     for id in nodes:
-        subprocess.run(["ssh", "node%d" % id] + command,
-                stdout=subprocess.DEVNULL)
+        do_subprocess(["ssh", "node%d" % id] + command)

-def get_sysctl_parameter(name):
+def get_sysctl_parameter(name, node):
     """
     Retrieve the value of a particular system parameter using sysctl on
-    the current host, and return the value as a string.
+    the given node, and return the value as a string.

     name:      name of the desired configuration parameter
+    node:      node number on which the value should be retrieved
     """
-    output = subprocess.run(["sysctl", name], stdout=subprocess.PIPE,
-            encoding="utf-8").stdout.rstrip()
+    output = do_subprocess(["ssh", "node%d" % node, "sysctl", name])
     match = re.match('.*= (.*)', output)
     if not match:
-        raise Error("Couldn't parse sysctl output: %s" % output)
+        raise Exception("Couldn't parse sysctl output: %s" % output)
     return match.group(1)

 def set_sysctl_parameter(name, value, nodes):
@@ -507,38 +630,87 @@
     nodes:     specifies ids of the nodes on which to execute the command:
                should be a range, list, or other object that supports "in"
     """
+    global stripped
+    if stripped:
+        vlog("Skipping set of Homa %s parameter to %s on nodes %s (Homa is stripped)"
+                % (name, value, str(nodes)))
+        return
     vlog("Setting Homa parameter %s to %s on nodes %s" % (name, value,
             str(nodes)))
     for id in nodes:
-        subprocess.run(["ssh", "node%d" % id, "sudo", "sysctl",
-                "%s=%s" % (name, value)], stdout=subprocess.DEVNULL)
+        do_subprocess(["ssh", "node%d" % id, "sudo", "sysctl",
+                "%s=%s" % (name, value)])

-def start_servers(r, options):
+def get_baseline_rtt():
+    """
+    Return the "best possible" RTT for short messages, for use in computing
+    slowdowns.
+    """
+    global baseline_rtts
+
+    node_type = get_node_type()
+    if node_type in baseline_rtts:
+        return baseline_rtts[node_type]
+    return baseline_rtts["default"]
+
+def get_node_type():
+    """
+    Returns the node type for this machine.
+    """
+
+    global node_type
+    if node_type:
+        return node_type
+    f = open("/var/emulab/boot/nodetype")
+    node_type = f.read().strip()
+    f.close()
+    return node_type
+
+def do_subprocess(words):
+    """
+    Invoke subprocess.run to run a command in a child process and then
+    check the results. Log any errors that are detected. Returns
+    stdout from the child (with trailing newlines removed).
+
+    words:     List of words for the command to run.
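+
+    Example (illustrative; "node1" and the sysctl parameter name are
+    arbitrary):
+        mbps = do_subprocess(["ssh", "node1", "sysctl", "-n",
+                ".net.homa.link_mbps"])
+    This runs the command, logs any errors, and returns its standard
+    output with trailing newlines removed.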
+ """ + result = subprocess.run(words, capture_output=True, encoding="utf-8") + if (result.returncode != 0): + log("Command %s exited with status %d" % (words, result.returncode)) + if (result.stderr != ""): + log("Error output from %s: %s" % (words, result.stderr.rstrip())) + return result.stdout.rstrip() + +def start_servers(exp, ids, options): """ Starts cp_node servers running on a group of nodes - r: A group of node ids on which to start cp_node servers + exp: Name of experiment these servers will be part of + ids: A list of node ids on which to start cp_node servers options: A namespace that must contain at least the following keys, which will be used to configure the servers: server_ports port_threads protocol """ - global server_nodes - log("Starting %s servers %d:%d" % (options.protocol, r.start, r.stop-1)) + global server_nodes, stripped + stripped = options.stripped + log("Starting servers for %s experiment on nodes %s" % (exp, ids)) if len(server_nodes) > 0: do_cmd("stop servers", server_nodes) - server_nodes = range(0,0) - start_nodes(r, options) + server_nodes = [] + start_nodes(ids, options) if options.protocol == "homa": - do_cmd("server --ports %d --port-threads %d --protocol %s %s" % ( - options.server_ports, options.port_threads, - options.protocol, options.ipv6), r) + do_cmd("server --ports %d --port-threads %d --protocol %s --exp %s %s" + % (options.server_ports, options.port_threads, + options.protocol, exp, options.ipv6), ids) else: - do_cmd("server --ports %d --port-threads %d --protocol %s %s" % ( - options.tcp_server_ports, options.tcp_port_threads, - options.protocol, options.ipv6), r) - server_nodes = r + do_cmd("server --ports %d --port-threads %d --protocol %s --exp %s %s" + % (options.tcp_server_ports, options.tcp_port_threads, + options.protocol, exp, options.ipv6), ids) + server_nodes = ids + if options.debug: + input("Pausing for debug setup, type to continue: ") def run_experiment(name, clients, options): """ @@ -552,66 +724,60 @@ def run_experiment(name, clients, options): which control the experiment: client_max client_ports - first_server gbps port_receivers protocol seconds - server_nodes server_ports + servers tcp_client_ports tcp_server_ports workload """ - global active_nodes + global active_nodes, stripped + exp_nodes = list(set(options.servers + list(clients))) start_nodes(clients, options) nodes = [] - log("Starting %s experiment with clients %d:%d" % ( - name, clients.start, clients.stop-1)) - num_servers = len(server_nodes) - if "server_nodes" in options: - num_servers = options.server_nodes - first_server = server_nodes.start - if "first_server" in options: - first_server = options.first_server + log("Starting clients for %s experiment on nodes %s" % (name, clients)) for id in clients: if options.protocol == "homa": command = "client --ports %d --port-receivers %d --server-ports %d " \ - "--workload %s --server-nodes %d --first-server %d " \ - "--gbps %.3f --client-max %d --protocol %s --id %d %s" % ( + "--workload %s --servers %s --gbps %.3f --client-max %d " \ + "--protocol %s --id %d --exp %s %s" % ( options.client_ports, options.port_receivers, options.server_ports, options.workload, - num_servers, - first_server, + ",".join([str(x) for x in options.servers]), options.gbps, options.client_max, options.protocol, id, + name, options.ipv6) - if "unloaded" in options: - command += " --unloaded %d" % (options.unloaded) else: if "no_trunc" in options: trunc = '--no-trunc' else: trunc = '' + client_max = options.tcp_client_max + if not 
client_max: + client_max = options.client_max command = "client --ports %d --port-receivers %d --server-ports %d " \ - "--workload %s --server-nodes %d --first-server %d " \ - "--gbps %.3f %s --client-max %d --protocol %s --id %d %s" % ( + "--workload %s --servers %s --gbps %.3f %s --client-max %d " \ + "--protocol %s --id %d --exp %s %s" % ( options.tcp_client_ports, options.tcp_port_receivers, options.tcp_server_ports, options.workload, - num_servers, - first_server, + ",".join([str(x) for x in options.servers]), options.gbps, trunc, - options.client_max, + client_max, options.protocol, id, + name, options.ipv6) active_nodes[id].stdin.write(command + "\n") try: @@ -621,39 +787,59 @@ def run_experiment(name, clients, options): nodes.append(id) vlog("Command for node%d: %s" % (id, command)) wait_output("% ", nodes, command, 40.0) - if not "unloaded" in options: - if options.protocol == "homa": - # Wait a bit so that homa_prio can set priorities appropriately - time.sleep(2) + if options.protocol == "homa": + # Wait a bit so that homa_prio can set priorities appropriately + time.sleep(2) + if stripped: + vlog("Skipping initial read of metrics (Homa is stripped)") + else: vlog("Recording initial metrics") - for id in active_nodes: - subprocess.run(["ssh", "node%d" % (id), "metrics.py"], - stdout=subprocess.DEVNULL) - if not "no_rtt_files" in options: - do_cmd("dump_times /dev/null", clients) - do_cmd("log Starting %s experiment" % (name), server_nodes, clients) - debug_delay = 0 - if debug_delay > 0: - time.sleep(debug_delay) - if False and "dctcp" in name: - log("Setting debug info") - do_cmd("debug 2000 3000", clients) - log("Finished setting debug info") - time.sleep(options.seconds - debug_delay) - do_cmd("log Ending %s experiment" % (name), server_nodes, clients) + for id in exp_nodes: + do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) + if options.protocol == "tcp" or options.protocol == "dctcp": + log("Waiting for TCP to warm up...") + time.sleep(10) + if not "no_rtt_files" in options: + do_cmd("dump_times /dev/null %s" % (name), clients) + if options.protocol == "homa" and options.tt_freeze: + log("Unfreezing timetraces on %s" % (nodes)) + set_sysctl_parameter(".net.homa.action", "10", nodes) + do_cmd("log Starting measurements for %s experiment" % (name), + server_nodes, clients) + log("Starting measurements") + debug_delay = 0 + if debug_delay > 0: + time.sleep(debug_delay) + if False and "dctcp" in name: + log("Setting debug info") + do_cmd("debug 2000 3000", clients) + log("Finished setting debug info") + time.sleep(options.seconds - debug_delay) + if options.protocol == "homa" and options.tt_freeze: + log("Freezing timetraces via node%d" % nodes[0]) + set_sysctl_parameter(".net.homa.action", "7", nodes[0:1]) + do_cmd("log Ending measurements for %s experiment" % (name), + server_nodes, clients) log("Retrieving data for %s experiment" % (name)) if not "no_rtt_files" in options: - do_cmd("dump_times rtts", clients) - if options.protocol == "homa": - vlog("Recording final metrics") - for id in active_nodes: - f = open("%s/%s-%d.metrics" % (options.log_dir, name, id), 'w') - subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f) - f.close() - shutil.copyfile("%s/%s-%d.metrics" % (options.log_dir, name, first_server), - "%s/reports/%s-%d.metrics" % (options.log_dir, name, first_server)) - shutil.copyfile("%s/%s-%d.metrics" % (options.log_dir, name, clients[0]), - "%s/reports/%s-%d.metrics" % (options.log_dir, name, clients[0])) + do_cmd("dump_times rtts %s" % (name), 
clients)
+    if options.protocol == "homa":
+        if stripped:
+            vlog("Skipping final read of metrics (Homa is stripped)")
+        else:
+            vlog("Recording final metrics from nodes %s" % (exp_nodes))
+            for id in exp_nodes:
+                f = open("%s/%s-%d.metrics" % (options.log_dir, name, id), 'w')
+                subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f)
+                f.close()
+            shutil.copyfile("%s/%s-%d.metrics" %
+                    (options.log_dir, name, options.servers[0]),
+                    "%s/reports/%s-%d.metrics" %
+                    (options.log_dir, name, options.servers[0]))
+            shutil.copyfile("%s/%s-%d.metrics" %
+                    (options.log_dir, name, clients[0]),
+                    "%s/reports/%s-%d.metrics" %
+                    (options.log_dir, name, clients[0]))
     do_cmd("stop senders", clients)
     if False and "dctcp" in name:
         do_cmd("tt print cp.tt", clients)
@@ -663,9 +849,180 @@
     do_cmd("stop clients", clients)
     if not "no_rtt_files" in options:
         for id in clients:
-            subprocess.run(["rsync", "-rtvq", "node%d:rtts" % (id),
+            do_subprocess(["rsync", "-rtvq", "node%d:rtts" % (id),
                 "%s/%s-%d.rtts" % (options.log_dir, name, id)])

+def run_experiments(*args):
+    """
+    Run multiple experiments simultaneously and collect statistics.
+
+    args: Each argument is a namespace describing an experiment to
+          run. The namespace must contain the following values:
+          name:        The name of the experiment; used to create files
+                       with the experiment's results.
+          clients:     List of node numbers on which to run clients for
+                       the experiment.
+          servers:     List of node numbers on which to run servers for
+                       the experiment (if the same server is in multiple
+                       experiments, the parameters from the first
+                       experiment are used to start the server).
+          protocol:    tcp or homa
+          gbps
+          seconds
+          workload
+
+          For Homa experiments the following values must be present:
+          client_max
+          client_ports
+          port_receivers
+          server_ports
+          port_threads
+
+          For TCP experiments the following values must be present:
+          tcp_client_max (or client_max)
+          tcp_client_ports
+          tcp_server_ports
+
+          There may be additional optional values that are used if present.
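+
+          A minimal calling sketch (hypothetical names and values; in
+          practice the namespaces come from get_parser()/parse_args(),
+          which also supply the optional values such as ipv6, tt_freeze,
+          and log_dir):
+
+              options = parser.parse_args()
+              init(options)
+              homa = copy.deepcopy(options)
+              homa.name, homa.protocol = "homa_w4", "homa"
+              homa.clients, homa.servers = [2, 3], [0, 1]
+              tcp = copy.deepcopy(options)
+              tcp.name, tcp.protocol = "tcp_w4", "tcp"
+              tcp.clients, tcp.servers = [2, 3], [0, 1]
+              run_experiments(homa, tcp)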
+ """ + + global active_nodes, stripped + + homa_nodes = [] + homa_clients = [] + homa_servers= [] + tcp_nodes = [] + for exp in args: + if exp.protocol == "homa": + homa_clients.extend(exp.clients) + homa_nodes.extend(exp.clients) + homa_servers.extend(exp.servers) + homa_nodes.extend(exp.servers) + elif exp.protocol == "tcp": + tcp_nodes.extend(exp.clients) + tcp_nodes.extend(exp.servers) + homa_clients = sorted(list(set(homa_clients))) + homa_servers = sorted(list(set(homa_servers))) + homa_nodes = sorted(list(set(homa_nodes))) + tcp_nodes = sorted(list(set(tcp_nodes))) + all_nodes = sorted(list(set(homa_nodes + tcp_nodes))) + + # Start servers for all experiments + stop_nodes() + for exp in args: + if exp.servers: + log("Starting servers for %s experiment on nodes %s" % (exp.name, + exp.servers)) + start_nodes(exp.servers, exp) + if exp.protocol == "homa": + do_cmd("server --ports %d --port-threads %d --protocol homa " + "--exp %s %s" + % (exp.server_ports, exp.port_threads, + exp.name, exp.ipv6), exp.servers) + else: + do_cmd("server --ports %d --port-threads %d --protocol tcp " + "--exp %s %s" + % (exp.tcp_server_ports, exp.tcp_port_threads, + exp.name, exp.ipv6), exp.servers) + + # Start clients for all experiments + for exp in args: + log("Starting clients for %s experiment on nodes %s" % (exp.name, + exp.clients)) + start_nodes(exp.clients, exp) + for id in exp.clients: + if exp.protocol == "homa": + command = "client --ports %d --port-receivers %d --server-ports %d " \ + "--workload %s --servers %s --gbps %.3f --client-max %d " \ + "--protocol homa --id %d --exp %s %s" % ( + exp.client_ports, + exp.port_receivers, + exp.server_ports, + exp.workload, + ",".join([str(x) for x in exp.servers]), + exp.gbps, + exp.client_max, + id, + exp.name, + exp.ipv6) + else: + client_max = exp.tcp_client_max + if not client_max: + client_max = exp.client_max + command = "client --ports %d --port-receivers %d --server-ports %d " \ + "--workload %s --servers %s --gbps %.3f --client-max %d " \ + "--protocol tcp --id %d --exp %s %s" % ( + exp.tcp_client_ports, + exp.tcp_port_receivers, + exp.tcp_server_ports, + exp.workload, + ",".join([str(x) for x in exp.servers]), + exp.gbps, + client_max, + id, + exp.name, + exp.ipv6) + active_nodes[id].stdin.write(command + "\n") + try: + active_nodes[id].stdin.flush() + except BrokenPipeError: + log("Broken pipe to node%d while starting %s client" % (id, + exp.protocol)) + vlog("Command for node%d: %s" % (id, command)) + wait_output("% ", exp.clients, command, 40.0) + if homa_clients: + # Wait a bit so that homa_prio can set priorities appropriately + time.sleep(2) + if tcp_nodes: + log("Waiting for TCP to warm up...") + time.sleep(10) + if homa_nodes: + if stripped: + vlog("Skipping metrics initialization (Homa is stripped)") + else: + vlog("Initializing metrics") + do_ssh(["metrics.py > /dev/null"], homa_nodes) + do_cmd("dump_times /dev/null", all_nodes) + if homa_nodes and exp.tt_freeze: + log("Unfreezing timetraces on %s" % (all_nodes)) + set_sysctl_parameter(".net.homa.action", "10", all_nodes) + do_cmd("log Starting measurements", all_nodes) + log("Starting measurements") + + time.sleep(exp.seconds) + + # Collect results + if homa_nodes and exp.tt_freeze: + log("Freezing timetraces via node%d" % all_nodes[0]) + set_sysctl_parameter(".net.homa.action", "7", all_nodes[0:1]) + do_cmd("log Ending measurements", all_nodes) + log("Retrieving data") + for exp in args: + do_cmd("dump_times %s.rtts %s" % (exp.name, exp.name), exp.clients) + if homa_nodes: + if 
stripped: + vlog("Skipping final read of metrics (Homa is stripped)") + else: + vlog("Recording final metrics from nodes %s" % (homa_nodes)) + for id in homa_nodes: + f = open("%s/node%d.metrics" % (exp.log_dir, id), 'w') + subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f) + f.close() + shutil.copyfile("%s/node%d.metrics" % + (exp.log_dir, homa_clients[0]), + "%s/reports/node%d.metrics" % + (exp.log_dir, homa_clients[0])) + shutil.copyfile("%s/node%d.metrics" % + (exp.log_dir, homa_servers[0]), + "%s/reports/node%d.metrics" % + (exp.log_dir, homa_servers[0])) + do_cmd("stop senders", all_nodes) + do_cmd("stop clients", all_nodes) + for exp in args: + for id in exp.clients: + do_subprocess(["rsync", "-rtvq", "node%d:%s.rtts" % (id, exp.name), + "%s/%s-%d.rtts" % (exp.log_dir, exp.name, id)]) + def scan_log(file, node, experiments): """ Read a log file and extract various useful information, such as fatal @@ -674,7 +1031,7 @@ def scan_log(file, node, experiments): file: Name of the log file to read node: Name of the node that generated the log, such as "node1". experiments: Info from the given log file is added to this structure - * At the top level it is dictionary indexed by experiment + * At the top level it is a dictionary indexed by experiment name, where * Each value is dictionary indexed by node name, where * Each value is a dictionary with keys such as client_kops, @@ -685,97 +1042,102 @@ def scan_log(file, node, experiments): exited = False experiment = "" node_data = None + active = False + timeouts = 0 for line in open(file): - match = re.match('.*Starting (.*) experiment', line) + if "FATAL:" in line: + log("%s: %s" % (file, line[:-1])) + exited = True + if "ERROR:" in line: + if "Homa RPC timed out" in line: + timeouts += 1 + if timeouts > 1: + continue + log("%s: %s" % (file, line[:-1])) + continue + if "cp_node exiting" in line: + exited = True + + match = re.match('.*Starting measurements', line) if match: - experiment = match.group(1) - if not experiment in experiments: - experiments[experiment] = {} - if not node in experiments[experiment]: - experiments[experiment][node] = {} - node_data = experiments[experiment][node] + active = True continue - if re.match('.*Ending .* experiment', line): - experiment = "" - if experiment != "": - gbps = -1.0 - match = re.match('.*Clients: ([0-9.]+) Kops/sec, ' + + match = re.match('.*Ending measurements', line) + if match: + active = False + continue + + if active: + match = re.match('[0-9.]+ (.*) clients: ([0-9.]+) Kops/sec, ' '([0-9.]+) Gbps.*P50 ([0-9.]+)', line) if match: - gbps = float(match.group(2)) - else: - match = re.match('.*Clients: ([0-9.]+) Kops/sec, ' - '([0-9.]+) MB/sec.*P50 ([0-9.]+)', line) - if match: - gbps = 8.0*float(match.group(2)) - if gbps >= 0.0: - if not "client_kops" in node_data: - node_data["client_kops"] = [] - node_data["client_kops"].append(float(match.group(1))) - if not "client_gbps" in node_data: - node_data["client_gbps"] = [] - node_data["client_gbps"].append(gbps) - if not "client_latency" in node_data: - node_data["client_latency"] = [] - node_data["client_latency"].append(float(match.group(3))) + node_data = experiments[match.group(1)][node] + gbps = float(match.group(3)) + if gbps >= 0.0: + if not "client_kops" in node_data: + node_data["client_kops"] = [] + node_data["client_kops"].append(float(match.group(2))) + if not "client_gbps" in node_data: + node_data["client_gbps"] = [] + node_data["client_gbps"].append(gbps) + if not "client_latency" in node_data: + 
node_data["client_latency"] = [] + node_data["client_latency"].append(float(match.group(4))) continue - gbps = -1.0 - match = re.match('.*Servers: ([0-9.]+) Kops/sec, ' + match = re.match('[0-9.]+ (.*) servers: ([0-9.]+) Kops/sec, ' '([0-9.]+) Gbps', line) if match: - gbps = float(match.group(2)) - else: - match = re.match('.*Servers: ([0-9.]+) Kops/sec, ' - '([0-9.]+) MB/sec', line) - if match: - gbps = 8.0*float(match.group(2)) - if gbps >= 0.0: - if not "server_kops" in node_data: - node_data["server_kops"] = [] - node_data["server_kops"].append(float(match.group(1))) - if not "server_gbps" in node_data: - node_data["server_gbps"] = [] - node_data["server_gbps"].append(gbps) + node_data = experiments[match.group(1)][node] + gbps = float(match.group(3)) + if gbps >= 0.0: + if not "server_kops" in node_data: + node_data["server_kops"] = [] + node_data["server_kops"].append(float(match.group(2))) + if not "server_gbps" in node_data: + node_data["server_gbps"] = [] + node_data["server_gbps"].append(gbps) continue - match = re.match('.*Outstanding client RPCs: ([0-9.]+)', line) + match = re.match('.*Outstanding client RPCs for (.*) ' + 'experiment: ([0-9.]+)', line) if match: + node_data = experiments[match.group(1)][node] if not "outstanding_rpcs" in node_data: node_data["outstanding_rpcs"] = [] - node_data["outstanding_rpcs"].append(int(match.group(1))) + node_data["outstanding_rpcs"].append(int(match.group(2))) continue - match = re.match('.*Backed-up sends: ([0-9.]+)/([0-9.]+)', line) + match = re.match('.*Backed-up (.*) sends: ([0-9.]+)/([0-9.]+)', + line) if match: + node_data = experiments[match.group(1)][node] if not "backups" in node_data: node_data["backups"] = [] - node_data["backups"].append(float(match.group(1)) - /float(match.group(2))) + total = float(match.group(3)) + if total > 0: + node_data["backups"].append(float(match.group(2))/total) continue - if "FATAL:" in line: - log("%s: %s" % (file, line[:-1])) - exited = True - if "ERROR:" in line: - log("%s: %s" % (file, line[:-1])) - if "cp_node exiting" in line: - exited = True if not exited: log("%s appears to have crashed (didn't exit)" % (node)) + if timeouts > 1: + log("%s: %d additional Homa RPC timeouts" % (file, timeouts-1)) def scan_logs(): """ - Read all of the nodespecific log files produced by a run, and + Read all of the node-specific log files produced by a run, and extract useful information. """ global log_dir, verbose - # This value is described in the header doc for scan_log. - experiments = {} + # Data collected so far for all experiments. See scan_log header + # comment for more info. 
+    experiments = defaultdict(lambda: defaultdict(dict))
     for file in sorted(glob.glob(log_dir + "/node*.log")):
-        node = re.match('.*/(node[0-9]+)\.log', file).group(1)
+        node = re.match(r'.*/(node[0-9]+)\.log', file).group(1)
         scan_log(file, node, experiments)

     for name, exp in experiments.items():
@@ -792,11 +1154,7 @@ def scan_logs():
             vlog("\n%ss for %s experiment:" % (type.capitalize(), name))
             for node in sorted(exp.keys()):
                 if not gbps_key in exp[node]:
-                    if name.startswith("unloaded"):
-                        exp[node][gbps_key] = [0.0]
-                        exp[node][kops_key] = [0.0]
-                    else:
-                        continue
+                    continue
                 gbps = exp[node][gbps_key]
                 avg = sum(gbps)/len(gbps)
                 vlog("%s: %.2f Gbps (%s)" % (node, avg,
@@ -826,14 +1184,21 @@ def scan_logs():
             vlog("%s average: %.1f Kops/sec"
                     % (type.capitalize(), totals[kops_key]/len(averages)))

+        for key in ["client_gbps", "client_kops", "server_gbps", "server_kops"]:
+            if not key in totals:
+                log("%s missing in node log files for experiment %s" % (
+                        key, name))
+                totals[key] = 0
+
         log("\nClients for %s experiment: %d nodes, %.2f Gbps, %.1f Kops/sec "
                 "(avg per node)" % (name, len(nodes["client"]),
                 totals["client_gbps"]/len(nodes["client"]),
                 totals["client_kops"]/len(nodes["client"])))
-        log("Servers for %s experiment: %d nodes, %.2f Gbps, %.1f Kops/sec "
-                "(avg per node)" % (name, len(nodes["server"]),
-                totals["server_gbps"]/len(nodes["server"]),
-                totals["server_kops"]/len(nodes["server"])))
+        if len(nodes["server"]) > 0:
+            log("Servers for %s experiment: %d nodes, %.2f Gbps, "
+                    "%.1f Kops/sec (avg per node)" % (name, len(nodes["server"]),
+                    totals["server_gbps"]/len(nodes["server"]),
+                    totals["server_kops"]/len(nodes["server"])))
         log("Overall for %s experiment: %d nodes, %.2f Gbps, %.1f Kops/sec "
                 "(avg per node)" % (name, len(nodes["all"]),
                 (totals["client_gbps"] + totals["server_gbps"])/len(nodes["all"]),
@@ -858,18 +1223,83 @@ def scan_logs():
                     % (100.0*sum(backups)/len(backups)))
     log("")

-def read_rtts(file, rtts):
+def scan_metrics(experiment):
     """
-    Read a file generated by cp_node's "dump_times" command and add its
-    data to the information present in rtts.
+    Reads in all of the .metrics files generated by an experiment,
+    extracts a few interesting statistics, and logs a message if some
+    nodes appear to have significantly different behavior from
+    others (to detect flaky nodes).
+    """
+
+    metrics_files = sorted(glob.glob(log_dir + ("/%s-*.metrics" % (experiment))))
+    if len(metrics_files) == 0:
+        return
+
+    metric_names = {'packets_sent_RESEND', 'packets_rcvd_RESEND'}
+    docs = {'cores': 'core utilization',
+            'packets_sent_RESEND': 'outgoing resend requests',
+            'packets_rcvd_RESEND': 'incoming resend requests'}
+    units = {'cores': '',
+            'packets_sent_RESEND': '/s',
+            'packets_rcvd_RESEND': '/s'}
+    thresholds = {'cores': 2,
+            'packets_sent_RESEND': 5,
+            'packets_rcvd_RESEND': 5}
+    # Keys are the same as in docs above; values are dictionaries whose
+    # keys are metrics file names and whose values are the value of the
+    # corresponding metric in that file.
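+    # For example (illustrative): metrics['cores'] might map
+    # 'logs/20251101/homa_w4-node1.metrics' to 9.7.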
+ metrics = {} + for name in docs.keys(): + metrics[name] = {} + for file in metrics_files: + f = open(file) + for name in docs.keys(): + metrics[name][file] = 0 + for line in f: + match = re.match('Total Core Utilization *([0-9.]+)', line) + if match: + metrics['cores'][file] = float(match.group(1)) + continue + match = re.match(r'([^ ]+) +([0-9]+) +\( *([0-9.]+ *[MKG]?)/s', line) + if not match: + continue + name = match.group(1) + if name in metric_names: + metrics[name][file] = unscale_number(match.group(3)) + f.close() + outlier_count = 0 + for name in metrics: + values = sorted(metrics[name].values()) + median = values[len(values)//2] + if (median == 0) and name == 'cores': + log("Couldn't find core utilization in metrics files") + continue + for file, value in metrics[name].items(): + if (value >= thresholds[name]) and (value > 1.5*median): + log("Outlier %s in %s: %s vs. %s median" + % (docs[name], file, scale_number(value, units[name]), + scale_number(median, units[name]))) - file: Name of the log file. - rtts: Dictionary whose keys are message lengths; each value is a - list of all of the rtts recorded for that message length (in usecs) - Returns: The total number of rtts read from the file. +def read_rtts(file, rtts, min_rtt = 0.0, link_mbps = 0.0): + """ + Read a file generated by cp_node's "dump_times" command and add its + data to the information present in rtts. Also computes average slowdown + across all the data in this file. + + file: Name of the log file. + rtts: Dictionary whose keys are message lengths; each value is a + list of all of the rtts recorded for that message length + (in usecs) + min_rtt: If nonzero, gives the minimum possible RTT for a short RPC + (used to compute slowdowns) + link_mbps: Speed of the host's uplink in Mbps. + Returns: The total number of rtts read from the file, and also the + average slowdown from this file. If min_rtt is zero, then + the slowdown will be zero. """ - total = 0 + num_rtts = 0 + slowdown_sum = 0 f = open(file, "r") for line in f: stripped = line.strip() @@ -877,7 +1307,7 @@ def read_rtts(file, rtts): continue words = stripped.split() if (len(words) < 2): - print("Line in %s too short (need at least 2 columns): '%s'" % + log("Line in %s too short (need at least 2 columns): '%s'" % (file, line)) continue length = int(words[0]) @@ -886,9 +1316,13 @@ def read_rtts(file, rtts): rtts[length].append(usec) else: rtts[length] = [usec] - total += 1 + if min_rtt > 0: + slowdown_sum += usec/(min_rtt + length*8/link_mbps) + num_rtts += 1 f.close() - return total + if num_rtts == 0: + return 0, 0 + return num_rtts, slowdown_sum/num_rtts def get_buckets(rtts, total): """ @@ -907,36 +1341,18 @@ def get_buckets(rtts, total): buckets.append([length, cumulative/total]) return buckets -def set_unloaded(experiment): - """ - Compute the optimal RTTs for each message size. - - experiment: Name of experiment that measured RTTs under low load - """ - - # Find (or generate) unloaded data for comparison. 
- files = sorted(glob.glob("%s/%s-*.rtts" % (log_dir, experiment))) - if len(files) == 0: - raise Exception("Couldn't find %s RTT data" % (experiment)) - rtts = {} - for file in files: - read_rtts(file, rtts) - unloaded_p50.clear() - for length in rtts.keys(): - unloaded_p50[length] = sorted(rtts[length])[len(rtts[length])//2] - vlog("Computed unloaded_p50: %d entries" % len(unloaded_p50)) - def get_digest(experiment): """ - Returns an element of digest that contains data for a particular + Returns an element of digests that contains data for a particular experiment; if this is the first request for a given experiment, the - method reads the data for experiment and generates the digest. For + method reads the raw RTT data for experiment and generates the digest. For each new digest generated, a .data file is generated in the "reports" subdirectory of the log directory. experiment: Name of the desired experiment """ - global digests, log_dir, unloaded_p50 + global digests, log_dir, delete_rtts + global link_mbps if experiment in digests: return digests[experiment] @@ -953,6 +1369,9 @@ def get_digest(experiment): digest["slow_99"] = [] digest["slow_999"] = [] + avg_slowdowns = [] + baseline_rtt = get_baseline_rtt() + # Read in the RTT files for this experiment. files = sorted(glob.glob(log_dir + ("/%s-*.rtts" % (experiment)))) if len(files) == 0: @@ -961,13 +1380,25 @@ def get_digest(experiment): sys.stdout.write("Reading RTT data for %s experiment: " % (experiment)) sys.stdout.flush() for file in files: - digest["total_messages"] += read_rtts(file, digest["rtts"]) + count, slowdown = read_rtts(file, digest["rtts"], baseline_rtt, link_mbps) + digest["total_messages"] += count + avg_slowdowns.append([file, slowdown]) sys.stdout.write("#") sys.stdout.flush() - print("") - if len(unloaded_p50) == 0: - raise Exception("No unloaded data: must invoked set_unloaded") + if delete_rtts: + os.remove(file) + log("") + + # See if some nodes have anomalous performance. + overall_avg = 0.0 + for info in avg_slowdowns: + overall_avg += info[1] + overall_avg = overall_avg/len(avg_slowdowns) + for info in avg_slowdowns: + if (info[1] < 0.8*overall_avg) or (info[1] > 1.2*overall_avg): + log("Outlier alt-slowdown in %s: %.1f vs. 
%.1f overall average"
+                    % (info[0], info[1], overall_avg))

     rtts = digest["rtts"]
     buckets = get_buckets(rtts, digest["total_messages"])
@@ -976,9 +1407,10 @@ def get_digest(experiment):
     bucket_rtts = []
     bucket_slowdowns = []
     bucket_count = 0
-    cur_unloaded = unloaded_p50[min(unloaded_p50.keys())]
+    slowdown_sum = 0.0
     lengths = sorted(rtts.keys())
     lengths.append(999999999)       # Force one extra loop iteration
+    optimal = baseline_rtt + lengths[0]*8/link_mbps
     for length in lengths:
         if length > bucket_length:
             digest["lengths"].append(bucket_length)
@@ -1002,13 +1434,28 @@ def get_digest(experiment):
             bucket_count = 0
             bucket_length, bucket_cum_frac = buckets[next_bucket]
             next_bucket += 1
-        if length in unloaded_p50:
-            cur_unloaded = unloaded_p50[length]
+        optimal = baseline_rtt + length*8/link_mbps
         bucket_count += len(rtts[length])
         for rtt in rtts[length]:
             bucket_rtts.append(rtt)
-            bucket_slowdowns.append(rtt/cur_unloaded)
-    log("Digest finished for %s" % (experiment))
+            slowdown = rtt/optimal
+            bucket_slowdowns.append(slowdown)
+            slowdown_sum += slowdown
+
+    # Get stats for shortest 10% of messages
+    small_rtts = []
+    small_count = 0
+    for length in lengths:
+        small_rtts.extend(rtts[length])
+        if len(small_rtts)/digest["total_messages"] > 0.1:
+            break
+    small_rtts.sort()
+    digest["avg_slowdown"] = slowdown_sum/digest["total_messages"]
+    log("%s has %d RPCs, avg slowdown %.2f, %d messages < %d bytes "
+            "(min %.1f us P50 %.1f us P99 %.1f us)" % (experiment,
+            digest["total_messages"], digest["avg_slowdown"], len(small_rtts),
+            length, small_rtts[0], small_rtts[len(small_rtts)//2],
+            small_rtts[99*len(small_rtts)//100]))

     dir = "%s/reports" % (log_dir)
     f = open("%s/reports/%s.data" % (log_dir, experiment), "w")
@@ -1027,15 +1474,64 @@ def get_digest(experiment):
     digests[experiment] = digest
     return digest

-def start_slowdown_plot(title, max_y, x_experiment, size=10,
+def read_digest(file):
+    """
+    Read digest data from a file and return the parsed digest. All digest
+    fields are populated except rtts.
+
+    file:   Name of the file to read
+    """
+
+    digest = {
+        "total_messages": 0,
+        "lengths": [],
+        "cum_frac": [],
+        "counts": [],
+        "p50": [],
+        "p99": [],
+        "p999": [],
+        "s50": [],
+        "s99": [],
+        "s999": []
+    }
+    line_num = 0
+    f = open(file)
+    for line in f:
+        line_num += 1
+        if line.startswith('#'):
+            continue
+        values = line.strip().split()
+        if len(values) != 9:
+            print("Line %d in %s had %d field(s), expected 9: %s" %
+                    (line_num, file, len(values), line.rstrip()),
+                    file=sys.stderr)
+            continue
+        length, cum_frac, count, p50, p99, p999, s50, s99, s999 = values
+        count = int(count)
+        digest["total_messages"] += count
+        digest["lengths"].append(int(length))
+        digest["cum_frac"].append(float(cum_frac))
+        digest["counts"].append(count)
+        digest["p50"].append(float(p50))
+        digest["p99"].append(float(p99))
+        digest["p999"].append(float(p999))
+        digest["s50"].append(float(s50))
+        digest["s99"].append(float(s99))
+        digest["s999"].append(float(s999))
+    f.close()
+    return digest
+
+def start_plot_vs_msg_length(title, y_range, x_experiment, size=10,
         show_top_label=True, show_bot_label=True, figsize=[6,4],
-        y_label="Slowdown", show_upper_x_axis= True):
+        y_label="Slowdown", show_upper_x_axis=True):
     """
-    Create a pyplot graph that will be used for slowdown data. Returns the
-    Axes object for the plot.
+    Create a pyplot graph that will be used to display some value as a
+    function of message size, with the x-axis scaled so that distance
+    corresponds to cumulative number of messages.
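+    Returns the Axes object for the plot.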
     title:        Title for the plot; may be empty
-    max_y:        Maximum y-coordinate
+    y_range:      Either a single value giving maximum y-coordinate
+                  (min will be 1) or a list containing min and max
+                  values. The y-axis will be log-scale.
     x_experiment: Name of experiment whose rtt distribution will be used
                   to label the x-axis of the plot. None means don't
                   label the x-axis (caller will presumably invoke
                   cdf_xaxis to do it).
@@ -1053,11 +1549,16 @@ def start_slowdown_plot(title, max_y, x_experiment, size=10,
     ax.set_title(title, size=size)
     ax.set_xlim(0, 1.0)
     ax.set_yscale("log")
-    ax.set_ylim(1, max_y)
+    if isinstance(y_range, list):
+        min_y, max_y = y_range
+    else:
+        min_y = 1
+        max_y = y_range
+    ax.set_ylim(min_y, max_y)
     ax.tick_params(right=True, which="both", direction="in", length=5)
     ticks = []
     labels = []
-    y = 1
+    y = 10 ** (math.ceil(math.log10(min_y)))
     while y <= max_y:
         ticks.append(y)
         labels.append("%d" % (y))
@@ -1092,10 +1593,9 @@ def start_slowdown_plot(title, max_y, x_experiment, size=10,
         target_count = 0
         tick = 0
         digest = get_digest(x_experiment)
-        rtts = digest["rtts"]
         total = digest["total_messages"]
-        for length in sorted(rtts.keys()):
-            cumulative_count += len(rtts[length])
+        for length, count in zip(digest["lengths"], digest["counts"]):
+            cumulative_count += count
             while cumulative_count >= target_count:
                 ticks.append(target_count/total)
                 if length < 1000:
@@ -1187,7 +1687,7 @@ def plot_slowdown(ax, experiment, percentile, label, **kwargs):

     ax:          matplotlib Axes object: info will be plotted here.
     experiment:  Name of the experiment whose data should be graphed.
-    percentile:  While percentile of slowdown to graph: must be "p50", "p99",
+    percentile:  Which percentile of slowdown to graph: must be "p50", "p99",
                  or "p999"
     label:       Text to display in the graph legend for this curve
     kwargs:      Additional keyword arguments to pass through to plt.plot
@@ -1208,6 +1708,23 @@
                 % (percentile))
     ax.plot(x, y, label=label, **kwargs)

+def plot_histogram(ax, experiment, metric, label, **kwargs):
+    """
+    Add a histogram to a plot created by start_plot_vs_msg_length().
+
+    ax:          matplotlib Axes object: info will be plotted here.
+    experiment:  Name of the experiment whose data should be graphed.
+    metric:      Metric from experiment to graph, such as "p50" for 50th
+                 percentile latency or "slow_99" for 99th percentile
+                 slowdown
+    label:       Text to display in the graph legend for this curve
+    kwargs:      Additional keyword arguments to pass through to plt.plot
+    """
+    digest = get_digest(experiment)
+    x, y = make_histogram(digest["cum_frac"], digest[metric],
+            init=[0, digest[metric][0]], after=False)
+    ax.plot(x, y, label=label, **kwargs)
+
 def start_cdf_plot(title, min_x, max_x, min_y, x_label, y_label,
         figsize=[5, 4], size=10, xscale="log", yscale="log"):
     """
@@ -1329,6 +1846,43 @@ def get_short_cdf(experiment):
     f.close()
     return [x, y]

+def read_file_data(file):
+    """
+    Reads data from a file and returns a dict whose keys are column names
+    and whose values are lists of values from the corresponding column.
+
+    file:   Path to the file containing the desired data. The file consists
+            of an initial line containing space-separated column names,
+            followed by any number of lines of data. Blank lines and lines
+            starting with "#" are ignored.
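+
+            Example (hypothetical contents):
+                # Per-second throughput samples
+                time_secs   gbps
+                0.0         21.4
+                1.0         22.0
+            yields {"time_secs": [0.0, 1.0], "gbps": [21.4, 22.0]}.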
+ """ + columns = {} + names = None + f = open(file) + for line in f: + fields = line.strip().split() + if len(fields) == 0: + continue + if fields[0] == '#': + continue + if not names: + names = fields + for n in names: + columns[n] = [] + else: + if len(fields) != len(names): + print("Bad line in %s: %s (expected %d columns, got %d)" + % (file, line.rstrip(), len(columns), len(fields))) + continue + for i in range(0, len(names)): + try: + value = float(fields[i]) + except ValueError: + value = fields[i] + columns[names[i]].append(value) + f.close() + return columns + def column_from_file(file, column): """ Return a list containing a column of data from a given file. @@ -1362,4 +1916,46 @@ def column_from_file(file, column): for i in range(0, len(columns)): data[columns[i]].append(float(fields[i])) data_from_files[file] = data - return data[column] \ No newline at end of file + return data[column] + +def scale_number(number, units): + """ + Return a string describing a number, but with a "K", "M", or "G" + suffix to keep the number small and readable. + + number: number to scale + units: additional units designation, such as "bps" or "/s" to add + """ + + if number > 1000000000: + return "%.1f G%s" % (number/1000000000.0, units) + if number > 1000000: + return "%.1f M%S" % (number/1000000.0, units) + elif (number > 1000): + return "%.1f K%s" % (number/1000.0, units) + else: + if units == "": + space = "" + else: + space = " " + return "%.1f%s%s" % (number, space, units) + +def unscale_number(number): + """ + Given a string representation of a number, which may have a "K", + "M", or "G" scale factor (e.g. "1.2 M"), return the actual number + (e.g. 1200000). + """ + match = re.match("([0-9.]+) *([GMK]?)$", number) + if not match: + raise Exception("Couldn't unscale '%s': bad syntax" % (number)) + mantissa = float(match.group(1)) + scale = match.group(2) + if scale == 'G': + return mantissa * 1e09 + elif scale == 'M': + return mantissa * 1e06 + elif scale == 'K': + return mantissa * 1e03 + else: + return mantissa \ No newline at end of file diff --git a/util/diff_metrics.py b/util/diff_metrics.py index 397f2e93..9e39a043 100755 --- a/util/diff_metrics.py +++ b/util/diff_metrics.py @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2018-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2018-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ This program reads 2 Homa metrics files (/proc/net/homa_metrics) diff --git a/util/diff_rtts.py b/util/diff_rtts.py new file mode 100755 index 00000000..21dd0f25 --- /dev/null +++ b/util/diff_rtts.py @@ -0,0 +1,168 @@ +#!/usr/bin/python3 + +# Copyright (c) 2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +""" +Compare two .rtts files to identify differences between them. 
diff --git a/util/diff_metrics.py b/util/diff_metrics.py index 397f2e93..9e39a043 100755 --- a/util/diff_metrics.py +++ b/util/diff_metrics.py @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2018-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2018-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ This program reads 2 Homa metrics files (/proc/net/homa_metrics) diff --git a/util/diff_rtts.py b/util/diff_rtts.py new file mode 100755 index 00000000..21dd0f25 --- /dev/null +++ b/util/diff_rtts.py @@ -0,0 +1,168 @@ +#!/usr/bin/python3 + +# Copyright (c) 2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +""" +Compare two .rtts files to identify differences between them. + +Usage: diff_rtts.py file1 file2 +""" + +from __future__ import division, print_function +from glob import glob +from operator import itemgetter +from optparse import OptionParser +import math +import os +import re +import string +import sys + +def read_rtts(file): + """ + Read a .rtts file and return a list of (length, slowdown) pairs. + + file: Name of file to read + """ + + slowdowns = [] + f = open(file) + for line in f: + if line.startswith('#') or not line.strip(): + continue + match = re.match(' *([0-9]+) +([0-9.]+)', line) + if not match: + raise Exception("Malformed line in .rtts file: %s" % (line.rstrip())) + length = int(match.group(1)) + rtt = float(match.group(2)) + + # Optimal time (usecs) assumes 13 usec minimum, 25 Gbps network + optimal = 13.0 + length*8/25000.0 + slowdown = rtt/optimal + slowdowns.append([length, slowdown]) + f.close() + return slowdowns + +def avg_slowdown(slowdowns): + """ + Return average slowdown from a list of (length, slowdown) pairs. + + slowdowns: Input list + """ + total = 0.0 + for item in slowdowns: + total += item[1] + return total/len(slowdowns) + +def deciles(slowdowns): + """ + Given a list of (length, slowdown) pairs, divide into 10 groups by + length, then return 6 lists (each with one entry per decile), + containing: + * largest length in the decile + * P50 slowdown for the decile + * P90 slowdown for the decile + * P99 slowdown for the decile + * P99.9 slowdown for the decile + * max slowdown for the decile + """ + p50 = [] + p90 = [] + p99 = [] + p999 = [] + maxes = [] + cutoffs = [] + s = sorted(slowdowns, key=itemgetter(0)) + for split in range(1, 11): + split_start = len(s)*(split-1)//10 + split_end = len(s)*split//10 + decile = [] + for i in range(split_start, split_end): + decile.append(s[i][1]) + cutoffs.append(s[split_end-1][0]) + decile = sorted(decile) + p50.append(decile[len(decile)//2]) + p90.append(decile[len(decile)*9//10]) + p99.append(decile[len(decile)*99//100]) + p999.append(decile[len(decile)*999//1000]) + maxes.append(decile[-1]) + return cutoffs, p50, p90, p99, p999, maxes + + +if len(sys.argv) != 3: + print("Usage: diff_rtts.py file1 file2") + exit(1) +f1 = sys.argv[1] +f2 = sys.argv[2] + +s1 = read_rtts(f1) +print("Average slowdown in %s: %.1f" % (f1, avg_slowdown(s1))) + +s2 = read_rtts(f2) +print("Average slowdown in %s: %.1f" % (f2, avg_slowdown(s2))) +print("") + +c1, p50_1, p90_1, p99_1, p999_1, max_1 = deciles(s1) +c2, p50_2, p90_2, p99_2, p999_2, max_2 = deciles(s2) + +out = "" +for cutoff in c1: + out += " %d" % (cutoff) +print("Cutoffs for %s:%s" % (f1, out)) +out = "" +for cutoff in c2: + out += " %d" % (cutoff) +print("Cutoffs for %s:%s" % (f2, out)) +print("") + +out = "" +for val in p50_1: + out += " %5.1f" % (val) +print("P50s for %s:%s" % (f1, out)) +out = "" +for val in p50_2: + out += " %5.1f" % (val) +print("P50s for %s:%s" % (f2, out)) +print("") + +out = "" +for val in p90_1: + out += " %5.1f" % (val) +print("P90s for %s:%s" % (f1, out)) +out = "" +for val in p90_2: + out += " %5.1f" % (val) +print("P90s for %s:%s" % (f2, out)) +print("") + +out = "" +for val in p99_1: + out += " %5.1f" % (val) +print("P99s for %s:%s" % (f1, out)) +out = "" +for val in p99_2: + out += " %5.1f" % (val) +print("P99s for %s:%s" % (f2, out)) +print("") + +out = "" +for val in p999_1: + out += " %5.1f" % (val) +print("P99.9s for %s:%s" % (f1, out)) +out = "" +for val in p999_2: + out += " %5.1f" % (val) +print("P99.9s for %s:%s" % (f2, out)) +print("") + +out = "" +for val in max_1: + out += " %5.1f" % (val)
+print("Maxes for %s:%s" % (f1, out)) +out = "" +for val in max_2: + out += " %5.1f" % (val) +print("Maxes for %s:%s" % (f2, out)) + +exit(0) \ No newline at end of file
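For reference, the slowdown that read_rtts() computes is the measured RTT divided by an idealized RTT: a 13 usec fixed floor plus serialization time at 25 Gbps (8 bits per byte / 25000 bits per usec). A toy check of that model in Python (the RTT and length values below are made up):

def optimal_usecs(length_bytes):
    # Same model as read_rtts(): 13 usec minimum plus wire time
    # at 25 Gbps.
    return 13.0 + length_bytes*8/25000.0

rtt = 20.0      # measured RTT in usecs
length = 1500   # message length in bytes
print("%.2f" % (rtt/optimal_usecs(length)))  # 1.48, i.e. 48% above optimal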
diff --git a/util/dist.cc b/util/dist.cc index a5a02f77..416eaa58 100644 --- a/util/dist.cc +++ b/util/dist.cc @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2023 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains the workload distributions from the Homa paper, plus @@ -23,13 +12,6 @@ #include "dist.h" -/* Forward declarations for built-in CDFs. */ -extern dist_point_gen::cdf_point w1[]; -extern dist_point_gen::cdf_point w2[]; -extern dist_point_gen::cdf_point w3[]; -extern dist_point_gen::cdf_point w4[]; -extern dist_point_gen::cdf_point w5[]; - /** * dist_point_gen() - Constructor for the dist_point generator class. Sets the * distribution for the object, potentially merging buckets to reduce the @@ -66,7 +48,7 @@ dist_point_gen::dist_point_gen(const char* dist, size_t max_length, return; } - cdf_point* points; + weight* points; if (strcmp(dist, "w1") == 0) { points = w1; } else if (strcmp(dist, "w2") == 0) { @@ -77,30 +59,44 @@ points = w4; } else if (strcmp(dist, "w5") == 0) { points = w5; + } else if (strcmp(dist, "starve") == 0) { + points = starve; } else { fprintf(stderr, "Invalid workload %s; must be w1, " - "w2, w3, w4, w5, or a number\n", dist); + "w2, w3, w4, w5, starve, or a number\n", dist); abort(); } - /* Reduce the set of points according to min_bucket_frac and - * max_size_ratio. + /* Add up all the weights in the distribution. */ + double total_weight = 0.0; + for (weight *p = points; p->length > 0; p++) { + total_weight += p->freq; + } + + /* Convert from weights to cumulative fractions, and reduce the set + * of points according to min_bucket_frac and max_size_ratio. */ - for (cdf_point *p = points; ; p++) { + double cur_frac = 0.0; + dist_points.emplace_back(0, 0.0); + for (weight *p = points; ; p++) { + if (p->length == 0) { + dist_points.back().fraction = 1.0; + break; + } if (p->length >= max_length) { dist_points.emplace_back(max_length, 1.0); break; } - if (p->fraction >= 1.0) { + cur_frac += p->freq/total_weight; + if (cur_frac >= 1.0) { dist_points.emplace_back(p->length, 1.0); break; } - if (dist_points.empty() - || (p->fraction - dist_points.back().fraction - >= min_bucket_frac) - || (max_size_ratio*dist_points.back().length - < p[1].length)) { - dist_points.emplace_back(p->length, p->fraction); + if ((cur_frac - dist_points.back().fraction >= + min_bucket_frac) || + (max_size_ratio*dist_points.back().length < + p[1].length)) { + dist_points.emplace_back(p->length, cur_frac); } } @@ -205,2198 +201,2216 @@ std::vector<int> dist_point_gen::values() const return output; } +/** + * cdf_fractions() - Returns a vector containing the cdf fraction of + * every point in this distribution. + */ +std::vector<double> dist_point_gen::cdf_fractions() const +{ + std::vector<double> output; + output.reserve(dist_points.size()); + + for (const cdf_point &point: dist_points) { + output.push_back(point.fraction); + } + return output; +} + /* - * The following arrays store CDFs for Workloads 1-5 from the Homa - * SIGCOMM paper. + * The following arrays store distributions for Workloads 1-5 from the Homa + * SIGCOMM paper. The first number is a message length and the second + * number indicates the relative frequency of messages of that length. + * The end of each array is indicated by an entry with 0 for both length + * and frequency. */ -dist_point_gen::cdf_point w1[] = { - {8, 0.31094}, - {9, 0.31931}, - {10, 0.32768}, - {11, 0.41757}, - {12, 0.41849}, - {13, 0.42175}, - {14, 0.44155}, - {15, 0.47}, - {16, 0.474821816501057}, - {17, 0.477306734916455}, - {18, 0.478598446159319}, - {19, 0.481284092023641}, - {20, 0.484111102839223}, - {21, 0.485579552767652}, - {22, 0.488630276714586}, - {23, 0.490214146264239}, - {24, 0.493502936247448}, - {25, 0.495209487027664}, - {26, 0.496958569155332}, - {27, 0.500587629388271}, - {28, 0.502469263537327}, - {29, 0.504396739229229}, - {30, 0.506370885197163}, - {31, 0.508392528537904}, - {32, 0.510462493543295}, - {33, 0.512581600464071}, - {34, 0.514750664204364}, - {35, 0.516970492945357}, - {36, 0.519241886696665}, - {37, 0.52156563577416}, - {38, 0.52394251920312}, - {39, 0.526373303045767}, - {40, 0.528858738652444}, - {41, 0.531399560835934}, - {43, 0.533996485968624}, - {44, 0.536650210002549}, - {45, 0.539361406412584}, - {47, 0.542130724063425}, - {48, 0.544958785001327}, - {50, 0.547846182171941}, - {51, 0.550793477066002}, - {53, 0.553801197295066}, - {54, 0.556869834099928}, - {56, 0.559999839794875}, - {57, 0.563191625151417}, - {59, 0.566445556725714}, - {61, 0.569761954134462}, - {63, 0.573141087284622}, - {65, 0.576583173562996}, - {67, 0.580088374992285}, - {69, 0.583656795360951}, - {71, 0.58728847733488}, - {73, 0.590983399559541}, - {75, 0.594741473762058}, - {77, 0.598562541863314}, - {79, 0.602446373110951}, - {82, 0.606392661244829}, - {84, 0.610401021707236}, - {87, 0.614470988910841}, - {89, 0.618602013578049}, - {92, 0.622793460166102}, - {95, 0.627044604392854}, - {97, 0.631354630878766}, - {100, 0.635722630921181}, - {103, 0.640147600417399}, - {106, 0.644628437953517}, - {110, 0.649163943076279}, - {113, 0.653752814765475}, - {116, 0.658393650124541}, - {120, 0.663084943307071}, - {123, 0.667825084696898}, - {127,
0.672612360359169}, - {131, 0.677444951779569}, - {135, 0.682320935908353}, - {139, 0.687238285525227}, - {143, 0.692194869940407}, - {147, 0.697188456046198}, - {151, 0.702216709732398}, - {156, 0.707277197677561}, - {161, 0.712367389526724}, - {165, 0.71748466046463}, - {170, 0.722626294191701}, - {175, 0.727789486308116}, - {181, 0.73297134810924}, - {186, 0.738168910793445}, - {191, 0.743379130080985}, - {197, 0.748598891240071}, - {203, 0.753825014513709}, - {209, 0.759054260938132}, - {215, 0.764283338540864}, - {222, 0.769508908903652}, - {228, 0.774727594072556}, - {235, 0.779935983794678}, - {242, 0.785130643058093}, - {249, 0.790308119908749}, - {257, 0.795464953515364}, - {264, 0.800597682450724}, - {272, 0.805702853155265}, - {280, 0.810777028546545}, - {289, 0.815816796736054}, - {297, 0.820818779812929}, - {306, 0.825779642652527}, - {315, 0.830696101706428}, - {324, 0.835564933729451}, - {334, 0.840382984398504}, - {344, 0.845147176777796}, - {354, 0.84985451958491}, - {365, 0.854502115212638}, - {376, 0.859087167462276}, - {387, 0.863606988945202}, - {398, 0.86805900811114}, - {410, 0.872440775863436}, - {422, 0.876749971723942}, - {435, 0.880984409512793}, - {448, 0.885142042511289}, - {461, 0.889220968079412}, - {475, 0.893219431703032}, - {489, 0.897135830449671}, - {503, 0.900968715815669}, - {518, 0.904716795951764}, - {534, 0.908378937258383}, - {549, 0.911954165346268}, - {566, 0.915441665362468}, - {582, 0.91884078168609}, - {600, 0.922151017002482}, - {618, 0.925372030768756}, - {636, 0.928503637087566}, - {655, 0.931545802009937}, - {674, 0.934498640291545}, - {694, 0.937362411630225}, - {715, 0.940137516415517}, - {736, 0.942824491023827}, - {758, 0.945424002695147}, - {780, 0.947936844029309}, - {804, 0.950363927141405}, - {827, 0.952706277517256}, - {852, 0.954965027610696}, - {877, 0.957141410224929}, - {903, 0.959236751720322}, - {930, 0.961252465090754}, - {958, 0.963190042950019}, - {986, 0.965051050468866}, - {1015, 0.966837118301992}, - {1046, 0.968549935542785}, - {1077, 0.97019124274183}, - {1108, 0.971762825023174}, - {1141, 0.973266505330149}, - {1175, 0.974704137830182}, - {1210, 0.976077601505512}, - {1246, 0.977388793954181}, - {1283, 0.978639625422937}, - {1321, 0.97983201309107}, - {1360, 0.980967875621433}, - {1401, 0.982049127992267}, - {1442, 0.983077676620779}, - {1485, 0.984055414786898}, - {1529, 0.984984218363116}, - {1574, 0.985865941853998}, - {1621, 0.986702414746682}, - {1669, 0.987495438171598}, - {1719, 0.988246781870691}, - {1770, 0.988958181468629}, - {1822, 0.989631336040849}, - {1876, 0.990267905970848}, - {1932, 0.990869511087789}, - {1989, 0.991437729074415}, - {2048, 0.991974094134257}, - {2109, 0.992480095906333}, - {2172, 0.992957178614888}, - {2236, 0.993406740441211}, - {2303, 0.993830133104215}, - {2371, 0.994228661636218}, - {2441, 0.994603584340256}, - {2514, 0.994956112915281}, - {2588, 0.995287412735657}, - {2665, 0.995598603271606}, - {2744, 0.995890758637502}, - {2826, 0.996164908255274}, - {2909, 0.996422037620571}, - {2996, 0.996663089159829}, - {3085, 0.996888963166859}, - {3176, 0.997100518808131}, - {3270, 0.997298575186473}, - {3367, 0.997483912453533}, - {3467, 0.997657272961877}, - {3570, 0.997819362448284}, - {3676, 0.997970851240328}, - {3785, 0.998112375478974}, - {3897, 0.998244538350506}, - {4013, 0.99836791132166}, - {4132, 0.998483035372409}, - {4255, 0.998590422221387}, - {4381, 0.99869055553946}, - {4511, 0.998783892147428}, - {4645, 0.99887086319436}, - {4782, 0.998951875313439}, - {4924, 
0.999027311752696}, - {5070, 0.999097533478328}, - {5221, 0.999162880248718}, - {5376, 0.999223671657584}, - {5535, 0.999280208145}, - {5699, 0.999332771975342}, - {5868, 0.999381628181432}, - {6043, 0.999427025474451}, - {6222, 0.999469197119345}, - {6406, 0.99950836177569}, - {6596, 0.999544724304136}, - {6792, 0.999578476538699}, - {6994, 0.999609798025316}, - {7201, 0.999638856727198}, - {7415, 0.999665809697595}, - {7635, 0.999690803720715}, - {7861, 0.999713975921559}, - {8094, 0.999735454345555}, - {8334, 0.99975535850886}, - {8582, 0.999773799920301}, - {8836, 0.999790882575893}, - {9098, 0.999806703426944}, - {9368, 0.999821352822742}, - {9646, 0.99983491492882}, - {9932, 0.999847468121828}, - {10227, 0.999859085361988}, - {10530, 0.999869834544144}, - {10843, 0.999879778828365}, - {11164, 0.999888976951074}, - {11496, 0.999897483517634}, - {11837, 0.999905349277299}, - {12188, 0.999912621381435}, - {12549, 0.999919343625857}, - {12922, 0.999925556678125}, - {13305, 0.999931298290599}, - {13700, 0.999936603500036}, - {14106, 0.99994150481446}, - {14524, 0.999946032388038}, - {14955, 0.999950214184624}, - {15399, 0.999954076130654}, - {15856, 0.999957642257982}, - {16326, 0.999960934837285}, - {16810, 0.999963974502583}, - {17309, 0.999966780367418}, - {17822, 0.999969370133206}, - {18351, 0.999971760190245}, - {18896, 0.999973965711837}, - {19456, 0.99997600074196}, - {20033, 0.999977878276901}, - {20627, 0.999979610341234}, - {21239, 0.999981208058518}, - {21869, 0.999982681717046}, - {22518, 0.999984040830983}, - {23186, 0.999985294197189}, - {23874, 0.999986449948021}, - {24582, 0.999987515600377}, - {25311, 0.999988498101245}, - {26062, 0.999989403869978}, - {26835, 0.999990238837541}, - {27631, 0.999991008482917}, - {28451, 0.999991717866879}, - {29295, 0.999992371663312}, - {30164, 0.999992974188243}, - {31059, 0.99999352942676}, - {31980, 0.999994041057951}, - {32929, 0.999994512478005}, - {33905, 0.999994946821621}, - {34911, 0.999995346981819}, - {35947, 0.9999957156283}, - {37013, 0.999996055224422}, - {38111, 0.99999636804293}, - {39242, 0.999996656180495}, - {40406, 0.999996921571177}, - {41604, 0.999997165998873}, - {42838, 0.999997391108826}, - {44109, 0.999997598418274}, - {45418, 0.999997789326289}, - {46765, 0.999997965122879}, - {48152, 0.999998126997391}, - {49580, 0.999998276046281}, - {51051, 0.999998413280298}, - {52565, 0.999998539631112}, - {54125, 0.99999865595744}, - {55730, 0.999998763050707}, - {57383, 0.999998861640266}, - {59086, 0.999998952398228}, - {60838, 0.99999903594391}, - {62643, 0.999999112847952}, - {64501, 0.99999918363611}, - {66415, 0.999999248792755}, - {68385, 0.99999930876411}, - {70413, 0.999999363961231}, - {72502, 0.999999414762756}, - {74653, 0.999999461517448}, - {76867, 0.999999504546531}, - {79147, 0.999999544145854}, - {81495, 0.999999580587881}, - {83912, 0.999999614123529}, - {86401, 0.999999644983861}, - {88964, 0.999999673381646}, - {91603, 0.999999699512802}, - {94321, 0.999999723557724}, - {97119, 0.999999745682501}, - {100000, 1.0}, +dist_point_gen::weight dist_point_gen::w1[] = { + {8, 0.310940000000}, + {9, 0.008370000000}, + {10, 0.008370000000}, + {11, 0.089890000000}, + {12, 0.000920000000}, + {13, 0.003260000000}, + {14, 0.019800000000}, + {15, 0.028450000000}, + {16, 0.004821816501}, + {17, 0.002484918415}, + {18, 0.001291711243}, + {19, 0.002685645864}, + {20, 0.002827010816}, + {21, 0.001468449928}, + {22, 0.003050723947}, + {23, 0.001583869550}, + {24, 0.003288789983}, + {25, 0.001706550780}, + {26, 
0.001749082128}, + {27, 0.003629060233}, + {28, 0.001881634149}, + {29, 0.001927475692}, + {30, 0.001974145968}, + {31, 0.002021643341}, + {32, 0.002069965005}, + {33, 0.002119106921}, + {34, 0.002169063740}, + {35, 0.002219828741}, + {36, 0.002271393751}, + {37, 0.002323749077}, + {38, 0.002376883429}, + {39, 0.002430783843}, + {40, 0.002485435607}, + {41, 0.002540822183}, + {43, 0.002596925133}, + {44, 0.002653724034}, + {45, 0.002711196410}, + {47, 0.002769317651}, + {48, 0.002828060938}, + {50, 0.002887397171}, + {51, 0.002947294894}, + {53, 0.003007720229}, + {54, 0.003068636805}, + {56, 0.003130005695}, + {57, 0.003191785357}, + {59, 0.003253931574}, + {61, 0.003316397409}, + {63, 0.003379133150}, + {65, 0.003442086278}, + {67, 0.003505201429}, + {69, 0.003568420369}, + {71, 0.003631681974}, + {73, 0.003694922225}, + {75, 0.003758074203}, + {77, 0.003821068101}, + {79, 0.003883831248}, + {82, 0.003946288134}, + {84, 0.004008360462}, + {87, 0.004069967204}, + {89, 0.004131024667}, + {92, 0.004191446588}, + {95, 0.004251144227}, + {97, 0.004310026486}, + {100, 0.004368000042}, + {103, 0.004424969496}, + {106, 0.004480837536}, + {110, 0.004535505123}, + {113, 0.004588871689}, + {116, 0.004640835359}, + {120, 0.004691293183}, + {123, 0.004740141390}, + {127, 0.004787275662}, + {131, 0.004832591420}, + {135, 0.004875984129}, + {139, 0.004917349617}, + {143, 0.004956584415}, + {147, 0.004993586106}, + {151, 0.005028253686}, + {156, 0.005060487945}, + {161, 0.005090191849}, + {165, 0.005117270938}, + {170, 0.005141633727}, + {175, 0.005163192116}, + {181, 0.005181861801}, + {186, 0.005197562684}, + {191, 0.005210219288}, + {197, 0.005219761159}, + {203, 0.005226123274}, + {209, 0.005229246424}, + {215, 0.005229077603}, + {222, 0.005225570363}, + {228, 0.005218685169}, + {235, 0.005208389722}, + {242, 0.005194659263}, + {249, 0.005177476851}, + {257, 0.005156833607}, + {264, 0.005132728935}, + {272, 0.005105170705}, + {280, 0.005074175391}, + {289, 0.005039768190}, + {297, 0.005001983077}, + {306, 0.004960862840}, + {315, 0.004916459054}, + {324, 0.004868832023}, + {334, 0.004818050669}, + {344, 0.004764192379}, + {354, 0.004707342807}, + {365, 0.004647595628}, + {376, 0.004585052250}, + {387, 0.004519821483}, + {398, 0.004452019166}, + {410, 0.004381767752}, + {422, 0.004309195861}, + {435, 0.004234437789}, + {448, 0.004157632998}, + {461, 0.004078925568}, + {475, 0.003998463624}, + {489, 0.003916398747}, + {503, 0.003832885366}, + {518, 0.003748080136}, + {534, 0.003662141307}, + {549, 0.003575228088}, + {566, 0.003487500016}, + {582, 0.003399116324}, + {600, 0.003310235316}, + {618, 0.003221013766}, + {636, 0.003131606319}, + {655, 0.003042164922}, + {674, 0.002952838282}, + {694, 0.002863771339}, + {715, 0.002775104785}, + {736, 0.002686974608}, + {758, 0.002599511671}, + {780, 0.002512841334}, + {804, 0.002427083112}, + {827, 0.002342350376}, + {852, 0.002258750093}, + {877, 0.002176382614}, + {903, 0.002095341495}, + {930, 0.002015713370}, + {958, 0.001937577859}, + {986, 0.001861007519}, + {1015, 0.001786067833}, + {1046, 0.001712817241}, + {1077, 0.001641307199}, + {1108, 0.001571582281}, + {1141, 0.001503680307}, + {1175, 0.001437632500}, + {1210, 0.001373463675}, + {1246, 0.001311192449}, + {1283, 0.001250831469}, + {1321, 0.001192387668}, + {1360, 0.001135862530}, + {1401, 0.001081252371}, + {1442, 0.001028548629}, + {1485, 0.000977738166}, + {1529, 0.000928803576}, + {1574, 0.000881723491}, + {1621, 0.000836472893}, + {1669, 0.000793023425}, + {1719, 0.000751343699}, + {1770, 
0.000711399598}, + {1822, 0.000673154572}, + {1876, 0.000636569930}, + {1932, 0.000601605117}, + {1989, 0.000568217987}, + {2048, 0.000536365060}, + {2109, 0.000506001772}, + {2172, 0.000477082709}, + {2236, 0.000449561826}, + {2303, 0.000423392663}, + {2371, 0.000398528532}, + {2441, 0.000374922704}, + {2514, 0.000352528575}, + {2588, 0.000331299820}, + {2665, 0.000311190536}, + {2744, 0.000292155366}, + {2826, 0.000274149618}, + {2909, 0.000257129365}, + {2996, 0.000241051539}, + {3085, 0.000225874007}, + {3176, 0.000211555641}, + {3270, 0.000198056378}, + {3367, 0.000185337267}, + {3467, 0.000173360508}, + {3570, 0.000162089486}, + {3676, 0.000151488792}, + {3785, 0.000141524239}, + {3897, 0.000132162872}, + {4013, 0.000123372971}, + {4132, 0.000115124051}, + {4255, 0.000107386849}, + {4381, 0.000100133318}, + {4511, 0.000093336608}, + {4645, 0.000086971047}, + {4782, 0.000081012119}, + {4924, 0.000075436439}, + {5070, 0.000070221726}, + {5221, 0.000065346770}, + {5376, 0.000060791409}, + {5535, 0.000056536487}, + {5699, 0.000052563830}, + {5868, 0.000048856206}, + {6043, 0.000045397293}, + {6222, 0.000042171645}, + {6406, 0.000039164656}, + {6596, 0.000036362528}, + {6792, 0.000033752235}, + {6994, 0.000031321487}, + {7201, 0.000029058702}, + {7415, 0.000026952970}, + {7635, 0.000024994023}, + {7861, 0.000023172201}, + {8094, 0.000021478424}, + {8334, 0.000019904163}, + {8582, 0.000018441411}, + {8836, 0.000017082656}, + {9098, 0.000015820851}, + {9368, 0.000014649396}, + {9646, 0.000013562106}, + {9932, 0.000012553193}, + {10227, 0.000011617240}, + {10530, 0.000010749182}, + {10843, 0.000009944284}, + {11164, 0.000009198123}, + {11496, 0.000008506567}, + {11837, 0.000007865760}, + {12188, 0.000007272104}, + {12549, 0.000006722244}, + {12922, 0.000006213052}, + {13305, 0.000005741612}, + {13700, 0.000005305209}, + {14106, 0.000004901314}, + {14524, 0.000004527574}, + {14955, 0.000004181797}, + {15399, 0.000003861946}, + {15856, 0.000003566127}, + {16326, 0.000003292579}, + {16810, 0.000003039665}, + {17309, 0.000002805865}, + {17822, 0.000002589766}, + {18351, 0.000002390057}, + {18896, 0.000002205522}, + {19456, 0.000002035030}, + {20033, 0.000001877535}, + {20627, 0.000001732064}, + {21239, 0.000001597717}, + {21869, 0.000001473659}, + {22518, 0.000001359114}, + {23186, 0.000001253366}, + {23874, 0.000001155751}, + {24582, 0.000001065652}, + {25311, 0.000000982501}, + {26062, 0.000000905769}, + {26835, 0.000000834968}, + {27631, 0.000000769645}, + {28451, 0.000000709384}, + {29295, 0.000000653796}, + {30164, 0.000000602525}, + {31059, 0.000000555239}, + {31980, 0.000000511631}, + {32929, 0.000000471420}, + {33905, 0.000000434344}, + {34911, 0.000000400160}, + {35947, 0.000000368646}, + {37013, 0.000000339596}, + {38111, 0.000000312819}, + {39242, 0.000000288138}, + {40406, 0.000000265391}, + {41604, 0.000000244428}, + {42838, 0.000000225110}, + {44109, 0.000000207309}, + {45418, 0.000000190908}, + {46765, 0.000000175797}, + {48152, 0.000000161875}, + {49580, 0.000000149049}, + {51051, 0.000000137234}, + {52565, 0.000000126351}, + {54125, 0.000000116326}, + {55730, 0.000000107093}, + {57383, 0.000000098590}, + {59086, 0.000000090758}, + {60838, 0.000000083546}, + {62643, 0.000000076904}, + {64501, 0.000000070788}, + {66415, 0.000000065157}, + {68385, 0.000000059971}, + {70413, 0.000000055197}, + {72502, 0.000000050802}, + {74653, 0.000000046755}, + {76867, 0.000000043029}, + {79147, 0.000000039599}, + {81495, 0.000000036442}, + {83912, 0.000000033536}, + {86401, 0.000000030860}, + 
{88964, 0.000000028398}, + {91603, 0.000000026131}, + {94321, 0.000000024045}, + {97119, 0.000000022125}, + {100000, 0.000000254317}, + {0, 0}, }; -dist_point_gen::cdf_point w2[] = { - {8, 0.170107627909139}, - {32, 0.189008475454599}, - {34, 0.202751220734426}, - {36, 0.216493966014254}, - {38, 0.230236711294082}, - {40, 0.24397945657391}, - {43, 0.257722201853738}, - {46, 0.271464947133566}, - {49, 0.285207692413394}, - {53, 0.298950437693222}, - {58, 0.31269318297305}, - {64, 0.326435928252878}, - {67, 0.327962652787335}, - {71, 0.329489377321793}, - {75, 0.331016101856251}, - {80, 0.332542826390709}, - {85, 0.334069550925166}, - {91, 0.335596275459624}, - {98, 0.337122999994082}, - {107, 0.338649724528539}, - {116, 0.340176449062997}, - {128, 0.341703173597455}, - {135, 0.353916810301949}, - {142, 0.366130447006443}, - {151, 0.378344083710937}, - {160, 0.390557720415431}, - {171, 0.402771357119925}, - {183, 0.414984993824419}, - {197, 0.427198630528913}, - {213, 0.439412267233407}, - {233, 0.451625903937901}, - {256, 0.463839540642395}, - {269, 0.508495648577945}, - {284, 0.553151756513495}, - {301, 0.597807864449045}, - {320, 0.642463972384595}, - {341, 0.687120080320145}, - {366, 0.731776188255695}, - {394, 0.776432296191245}, - {427, 0.821088404126795}, - {465, 0.865744512062345}, - {512, 0.910400619997895}, - {539, 0.915171571649521}, - {569, 0.919942523301147}, - {602, 0.924713474952773}, - {640, 0.929484426604399}, - {683, 0.934255378256025}, - {731, 0.939026329907651}, - {788, 0.943797281559276}, - {853, 0.948568233210902}, - {931, 0.953339184862528}, - {1024, 0.958110136514154}, - {1078, 0.960400194140515}, - {1138, 0.962690251766876}, - {1205, 0.964980309393238}, - {1280, 0.967270367019599}, - {1365, 0.96956042464596}, - {1463, 0.971850482272321}, - {1575, 0.974140539898682}, - {1707, 0.976430597525043}, - {1862, 0.978720655151404}, - {2048, 0.981010712777766}, - {2156, 0.982441997677838}, - {2276, 0.983873282577911}, - {2409, 0.985304567477984}, - {2560, 0.986735852378057}, - {2731, 0.98816713727813}, - {2926, 0.989598422178203}, - {3151, 0.991029707078276}, - {3413, 0.992460991978349}, - {3724, 0.993892276878422}, - {4096, 0.995323561778495}, - {4312, 0.995633673903787}, - {4551, 0.99594378602908}, - {4819, 0.996253898154372}, - {5120, 0.996564010279664}, - {5461, 0.996874122404957}, - {5851, 0.997184234530249}, - {6302, 0.997494346655541}, - {6827, 0.997804458780834}, - {7447, 0.998114570906126}, - {8192, 0.998424683031419}, - {8623, 0.998472392503279}, - {9102, 0.998520101975139}, - {9638, 0.998567811446999}, - {10240, 0.998615520918859}, - {10923, 0.998663230390719}, - {11703, 0.998710939862579}, - {12603, 0.99875864933444}, - {13653, 0.9988063588063}, - {14895, 0.99885406827816}, - {16384, 0.99890177775002}, - {17246, 0.99892563248595}, - {18204, 0.99894948722188}, - {19275, 0.99897334195781}, - {20480, 0.99899719669374}, - {21845, 0.99902105142967}, - {23406, 0.9990449061656}, - {25206, 0.999068760901531}, - {27307, 0.999092615637461}, - {29789, 0.999116470373391}, - {32768, 0.999140325109321}, - {34493, 0.999161198026518}, - {36409, 0.999182070943715}, - {38551, 0.999202943860912}, - {40960, 0.99922381677811}, - {43691, 0.999244689695307}, - {46811, 0.999265562612504}, - {50412, 0.999286435529701}, - {54613, 0.999307308446898}, - {59578, 0.999328181364096}, - {65536, 0.999349054281293}, - {68985, 0.999401236574286}, - {72818, 0.999453418867279}, - {77101, 0.999505601160272}, - {81920, 0.999557783453265}, - {87381, 0.999609965746258}, - {93623, 0.999662148039251}, - 
{100825, 0.999714330332244}, - {109227, 0.999766512625237}, - {119156, 0.99981869491823}, - {131072, 0.999870877211223}, - {137971, 0.999881313669821}, - {145636, 0.99989175012842}, - {154202, 0.999902186587019}, - {163840, 0.999912623045617}, - {174763, 0.999923059504216}, - {187246, 0.999933495962815}, - {201649, 0.999943932421413}, - {218453, 0.999954368880012}, - {238313, 0.99996480533861}, - {262144, 0.999975241797209}, - {275941, 0.999975987263521}, - {291271, 0.999976732729834}, - {308405, 0.999977478196146}, - {327680, 0.999978223662459}, - {349525, 0.999978969128771}, - {374491, 0.999979714595084}, - {403298, 0.999980460061396}, - {436907, 0.999981205527708}, - {476625, 0.999981950994021}, - {524288, 0.999982696460333}, - {573085, 0.9999844268143}, - {631896, 0.999986157168266}, - {704160, 0.999987887522233}, - {795085, 0.9999896178762}, - {912974, 0.999991348230167}, - {1071908, 0.999993078584133}, - {1297841, 0.9999948089381}, - {1644453, 0.999996539292067}, - {2243665, 0.999998269646033}, - {3529904, 1}, +dist_point_gen::weight dist_point_gen::w2[] = { + {8, 0.170107627909}, + {32, 0.018900847545}, + {34, 0.013742745280}, + {36, 0.013742745280}, + {38, 0.013742745280}, + {40, 0.013742745280}, + {43, 0.013742745280}, + {46, 0.013742745280}, + {49, 0.013742745280}, + {53, 0.013742745280}, + {58, 0.013742745280}, + {64, 0.013742745280}, + {67, 0.001526724534}, + {71, 0.001526724534}, + {75, 0.001526724534}, + {80, 0.001526724534}, + {85, 0.001526724534}, + {91, 0.001526724534}, + {98, 0.001526724534}, + {107, 0.001526724534}, + {116, 0.001526724534}, + {128, 0.001526724534}, + {135, 0.012213636704}, + {142, 0.012213636704}, + {151, 0.012213636704}, + {160, 0.012213636704}, + {171, 0.012213636704}, + {183, 0.012213636704}, + {197, 0.012213636704}, + {213, 0.012213636704}, + {233, 0.012213636704}, + {256, 0.012213636704}, + {269, 0.044656107936}, + {284, 0.044656107936}, + {301, 0.044656107936}, + {320, 0.044656107936}, + {341, 0.044656107936}, + {366, 0.044656107936}, + {394, 0.044656107936}, + {427, 0.044656107936}, + {465, 0.044656107936}, + {512, 0.044656107936}, + {539, 0.004770951652}, + {569, 0.004770951652}, + {602, 0.004770951652}, + {640, 0.004770951652}, + {683, 0.004770951652}, + {731, 0.004770951652}, + {788, 0.004770951652}, + {853, 0.004770951652}, + {931, 0.004770951652}, + {1024, 0.004770951652}, + {1078, 0.002290057626}, + {1138, 0.002290057626}, + {1205, 0.002290057626}, + {1280, 0.002290057626}, + {1365, 0.002290057626}, + {1463, 0.002290057626}, + {1575, 0.002290057626}, + {1707, 0.002290057626}, + {1862, 0.002290057626}, + {2048, 0.002290057626}, + {2156, 0.001431284900}, + {2276, 0.001431284900}, + {2409, 0.001431284900}, + {2560, 0.001431284900}, + {2731, 0.001431284900}, + {2926, 0.001431284900}, + {3151, 0.001431284900}, + {3413, 0.001431284900}, + {3724, 0.001431284900}, + {4096, 0.001431284900}, + {4312, 0.000310112125}, + {4551, 0.000310112125}, + {4819, 0.000310112125}, + {5120, 0.000310112125}, + {5461, 0.000310112125}, + {5851, 0.000310112125}, + {6302, 0.000310112125}, + {6827, 0.000310112125}, + {7447, 0.000310112125}, + {8192, 0.000310112125}, + {8623, 0.000047709472}, + {9102, 0.000047709472}, + {9638, 0.000047709472}, + {10240, 0.000047709472}, + {10923, 0.000047709472}, + {11703, 0.000047709472}, + {12603, 0.000047709472}, + {13653, 0.000047709472}, + {14895, 0.000047709472}, + {16384, 0.000047709472}, + {17246, 0.000023854736}, + {18204, 0.000023854736}, + {19275, 0.000023854736}, + {20480, 0.000023854736}, + {21845, 0.000023854736}, + {23406, 
0.000023854736}, + {25206, 0.000023854736}, + {27307, 0.000023854736}, + {29789, 0.000023854736}, + {32768, 0.000023854736}, + {34493, 0.000020872917}, + {36409, 0.000020872917}, + {38551, 0.000020872917}, + {40960, 0.000020872917}, + {43691, 0.000020872917}, + {46811, 0.000020872917}, + {50412, 0.000020872917}, + {54613, 0.000020872917}, + {59578, 0.000020872917}, + {65536, 0.000020872917}, + {68985, 0.000052182293}, + {72818, 0.000052182293}, + {77101, 0.000052182293}, + {81920, 0.000052182293}, + {87381, 0.000052182293}, + {93623, 0.000052182293}, + {100825, 0.000052182293}, + {109227, 0.000052182293}, + {119156, 0.000052182293}, + {131072, 0.000052182293}, + {137971, 0.000010436459}, + {145636, 0.000010436459}, + {154202, 0.000010436459}, + {163840, 0.000010436459}, + {174763, 0.000010436459}, + {187246, 0.000010436459}, + {201649, 0.000010436459}, + {218453, 0.000010436459}, + {238313, 0.000010436459}, + {262144, 0.000010436459}, + {275941, 0.000000745466}, + {291271, 0.000000745466}, + {308405, 0.000000745466}, + {327680, 0.000000745466}, + {349525, 0.000000745466}, + {374491, 0.000000745466}, + {403298, 0.000000745466}, + {436907, 0.000000745466}, + {476625, 0.000000745466}, + {524288, 0.000000745466}, + {573085, 0.000001730354}, + {631896, 0.000001730354}, + {704160, 0.000001730354}, + {795085, 0.000001730354}, + {912974, 0.000001730354}, + {1071908, 0.000001730354}, + {1297841, 0.000001730354}, + {1644453, 0.000001730354}, + {2243665, 0.000001730354}, + {3529904, 0.000001730354}, + {0, 0}, }; -dist_point_gen::cdf_point w3[] = { - {8, 0.0648826230027598}, - {32, 0.0973239345041398}, - {36, 0.108913294911834}, - {40, 0.120502655319528}, - {46, 0.132092015727223}, - {53, 0.143681376134917}, - {64, 0.155270736542611}, - {70, 0.18798780420397}, - {77, 0.220704871865328}, - {85, 0.253421939526687}, - {96, 0.286139007188045}, - {110, 0.318856074849404}, - {128, 0.351573142510763}, - {137, 0.369864095116246}, - {146, 0.388155047721729}, - {158, 0.406446000327212}, - {171, 0.424736952932695}, - {186, 0.443027905538178}, - {205, 0.461318858143661}, - {228, 0.479609810749145}, - {256, 0.497900763354628}, - {268, 0.523993534878443}, - {282, 0.550086306402258}, - {296, 0.576179077926073}, - {313, 0.602271849449888}, - {331, 0.628364620973703}, - {352, 0.654457392497518}, - {375, 0.680550164021333}, - {402, 0.706642935545148}, - {433, 0.732735707068963}, - {469, 0.758828478592778}, - {512, 0.784921250116593}, - {531, 0.790512558564584}, - {551, 0.796103867012575}, - {573, 0.801695175460566}, - {597, 0.807286483908557}, - {623, 0.812877792356548}, - {652, 0.818469100804539}, - {683, 0.82406040925253}, - {717, 0.829651717700521}, - {755, 0.835243026148512}, - {796, 0.840834334596503}, - {843, 0.846425643044494}, - {896, 0.852016951492485}, - {956, 0.857608259940476}, - {1024, 0.863199568388467}, - {1053, 0.865736365928318}, - {1084, 0.86827316346817}, - {1117, 0.870809961008021}, - {1152, 0.873346758547872}, - {1189, 0.875883556087723}, - {1229, 0.878420353627574}, - {1271, 0.880957151167425}, - {1317, 0.883493948707276}, - {1365, 0.886030746247128}, - {1418, 0.888567543786979}, - {1475, 0.89110434132683}, - {1536, 0.893641138866681}, - {1603, 0.896177936406532}, - {1676, 0.898714733946383}, - {1755, 0.901251531486234}, - {1843, 0.903788329026086}, - {1940, 0.906325126565937}, - {2048, 0.908861924105788}, - {2092, 0.909949120991973}, - {2137, 0.911036317878158}, - {2185, 0.912123514764344}, - {2234, 0.913210711650529}, - {2286, 0.914297908536714}, - {2341, 0.9153851054229}, - {2398, 
0.916472302309085}, - {2458, 0.91755949919527}, - {2521, 0.918646696081455}, - {2587, 0.919733892967641}, - {2657, 0.920821089853826}, - {2731, 0.921908286740011}, - {2809, 0.922995483626197}, - {2891, 0.924082680512382}, - {2979, 0.925169877398567}, - {3072, 0.926257074284752}, - {3171, 0.927344271170938}, - {3277, 0.928431468057123}, - {3390, 0.929518664943308}, - {3511, 0.930605861829494}, - {3641, 0.931693058715679}, - {3781, 0.932780255601864}, - {3932, 0.933867452488049}, - {4096, 0.934954649374235}, - {4163, 0.935901565991808}, - {4233, 0.936848482609381}, - {4304, 0.937795399226954}, - {4378, 0.938742315844527}, - {4455, 0.9396892324621}, - {4535, 0.940636149079674}, - {4617, 0.941583065697247}, - {4703, 0.94252998231482}, - {4792, 0.943476898932393}, - {4884, 0.944423815549966}, - {4979, 0.945370732167539}, - {5079, 0.946317648785112}, - {5183, 0.947264565402686}, - {5291, 0.948211482020259}, - {5403, 0.949158398637832}, - {5521, 0.950105315255405}, - {5643, 0.951052231872978}, - {5772, 0.951999148490551}, - {5906, 0.952946065108124}, - {6046, 0.953892981725698}, - {6194, 0.954839898343271}, - {6349, 0.955786814960844}, - {6512, 0.956733731578417}, - {6683, 0.95768064819599}, - {6864, 0.958627564813563}, - {7054, 0.959574481431136}, - {7256, 0.96052139804871}, - {7469, 0.961468314666283}, - {7696, 0.962415231283856}, - {7936, 0.963362147901429}, - {8192, 0.964309064519002}, - {8293, 0.964806259070139}, - {8397, 0.965303453621275}, - {8503, 0.965800648172412}, - {8612, 0.966297842723548}, - {8724, 0.966795037274685}, - {8839, 0.967292231825821}, - {8957, 0.967789426376958}, - {9078, 0.968286620928094}, - {9202, 0.968783815479231}, - {9330, 0.969281010030367}, - {9461, 0.969778204581504}, - {9596, 0.97027539913264}, - {9735, 0.970772593683777}, - {9879, 0.971269788234913}, - {10026, 0.97176698278605}, - {10178, 0.972264177337186}, - {10335, 0.972761371888322}, - {10496, 0.973258566439459}, - {10663, 0.973755760990595}, - {10835, 0.974252955541732}, - {11012, 0.974750150092868}, - {11196, 0.975247344644005}, - {11385, 0.975744539195141}, - {11582, 0.976241733746278}, - {11785, 0.976738928297414}, - {11995, 0.977236122848551}, - {12214, 0.977733317399687}, - {12440, 0.978230511950824}, - {12674, 0.97872770650196}, - {12918, 0.979224901053097}, - {13171, 0.979722095604233}, - {13435, 0.98021929015537}, - {13709, 0.980716484706506}, - {13995, 0.981213679257643}, - {14292, 0.981710873808779}, - {14603, 0.982208068359916}, - {14928, 0.982705262911052}, - {15267, 0.983202457462189}, - {15622, 0.983699652013325}, - {15994, 0.984196846564462}, - {16384, 0.984694041115598}, - {16540, 0.984801735350781}, - {16699, 0.984909429585963}, - {16861, 0.985017123821145}, - {17027, 0.985124818056328}, - {17195, 0.98523251229151}, - {17367, 0.985340206526693}, - {17542, 0.985447900761875}, - {17721, 0.985555594997057}, - {17904, 0.98566328923224}, - {18091, 0.985770983467422}, - {18281, 0.985878677702604}, - {18476, 0.985986371937787}, - {18674, 0.986094066172969}, - {18877, 0.986201760408151}, - {19085, 0.986309454643334}, - {19297, 0.986417148878516}, - {19514, 0.986524843113698}, - {19735, 0.986632537348881}, - {19962, 0.986740231584063}, - {20194, 0.986847925819246}, - {20432, 0.986955620054428}, - {20675, 0.98706331428961}, - {20924, 0.987171008524793}, - {21179, 0.987278702759975}, - {21441, 0.987386396995157}, - {21709, 0.98749409123034}, - {21984, 0.987601785465522}, - {22265, 0.987709479700704}, - {22555, 0.987817173935887}, - {22851, 0.987924868171069}, - {23156, 0.988032562406252}, - {23469, 
0.988140256641434}, - {23790, 0.988247950876616}, - {24121, 0.988355645111799}, - {24461, 0.988463339346981}, - {24810, 0.988571033582163}, - {25170, 0.988678727817346}, - {25540, 0.988786422052528}, - {25921, 0.98889411628771}, - {26314, 0.989001810522893}, - {26719, 0.989109504758075}, - {27136, 0.989217198993258}, - {27567, 0.98932489322844}, - {28011, 0.989432587463622}, - {28471, 0.989540281698805}, - {28945, 0.989647975933987}, - {29436, 0.989755670169169}, - {29943, 0.989863364404352}, - {30468, 0.989971058639534}, - {31013, 0.990078752874716}, - {31576, 0.990186447109899}, - {32161, 0.990294141345081}, - {32768, 0.990401835580264}, - {33007, 0.990443196317542}, - {33250, 0.990484557054821}, - {33496, 0.990525917792099}, - {33746, 0.990567278529378}, - {34000, 0.990608639266656}, - {34257, 0.990650000003935}, - {34519, 0.990691360741213}, - {34784, 0.990732721478492}, - {35054, 0.99077408221577}, - {35328, 0.990815442953049}, - {35606, 0.990856803690327}, - {35889, 0.990898164427606}, - {36176, 0.990939525164885}, - {36468, 0.990980885902163}, - {36764, 0.991022246639442}, - {37065, 0.99106360737672}, - {37372, 0.991104968113999}, - {37683, 0.991146328851277}, - {38000, 0.991187689588556}, - {38322, 0.991229050325834}, - {38649, 0.991270411063113}, - {38983, 0.991311771800391}, - {39322, 0.99135313253767}, - {39667, 0.991394493274948}, - {40018, 0.991435854012227}, - {40375, 0.991477214749506}, - {40739, 0.991518575486784}, - {41109, 0.991559936224063}, - {41486, 0.991601296961341}, - {41870, 0.99164265769862}, - {42262, 0.991684018435898}, - {42660, 0.991725379173177}, - {43067, 0.991766739910455}, - {43481, 0.991808100647734}, - {43903, 0.991849461385012}, - {44333, 0.991890822122291}, - {44772, 0.99193218285957}, - {45220, 0.991973543596848}, - {45677, 0.992014904334127}, - {46143, 0.992056265071405}, - {46618, 0.992097625808684}, - {47104, 0.992138986545962}, - {47600, 0.992180347283241}, - {48106, 0.992221708020519}, - {48623, 0.992263068757798}, - {49152, 0.992304429495076}, - {49692, 0.992345790232355}, - {50244, 0.992387150969633}, - {50809, 0.992428511706912}, - {51386, 0.992469872444191}, - {51977, 0.992511233181469}, - {52581, 0.992552593918748}, - {53200, 0.992593954656026}, - {53833, 0.992635315393305}, - {54482, 0.992676676130583}, - {55146, 0.992718036867862}, - {55827, 0.99275939760514}, - {56525, 0.992800758342419}, - {57240, 0.992842119079697}, - {57974, 0.992883479816976}, - {58727, 0.992924840554255}, - {59500, 0.992966201291533}, - {60293, 0.993007562028812}, - {61108, 0.99304892276609}, - {61945, 0.993090283503369}, - {62805, 0.993131644240647}, - {63690, 0.993173004977926}, - {64600, 0.993214365715204}, - {65536, 0.993255726452483}, - {65902, 0.993298761415796}, - {66272, 0.993341796379109}, - {66647, 0.993384831342422}, - {67025, 0.993427866305735}, - {67408, 0.993470901269048}, - {67796, 0.993513936232361}, - {68188, 0.993556971195674}, - {68584, 0.993600006158987}, - {68985, 0.9936430411223}, - {69391, 0.993686076085613}, - {69802, 0.993729111048926}, - {70217, 0.993772146012239}, - {70638, 0.993815180975552}, - {71063, 0.993858215938865}, - {71494, 0.993901250902179}, - {71930, 0.993944285865492}, - {72371, 0.993987320828805}, - {72818, 0.994030355792118}, - {73270, 0.994073390755431}, - {73728, 0.994116425718744}, - {74192, 0.994159460682057}, - {74661, 0.99420249564537}, - {75137, 0.994245530608683}, - {75618, 0.994288565571996}, - {76106, 0.994331600535309}, - {76601, 0.994374635498622}, - {77101, 0.994417670461935}, - {77608, 0.994460705425248}, - 
{78122, 0.994503740388561}, - {78643, 0.994546775351874}, - {79171, 0.994589810315187}, - {79706, 0.9946328452785}, - {80248, 0.994675880241813}, - {80798, 0.994718915205126}, - {81355, 0.994761950168439}, - {81920, 0.994804985131752}, - {82493, 0.994848020095065}, - {83074, 0.994891055058378}, - {83663, 0.994934090021691}, - {84261, 0.994977124985004}, - {84867, 0.995020159948317}, - {85482, 0.995063194911631}, - {86106, 0.995106229874944}, - {86739, 0.995149264838257}, - {87381, 0.99519229980157}, - {88033, 0.995235334764883}, - {88695, 0.995278369728196}, - {89367, 0.995321404691509}, - {90049, 0.995364439654822}, - {90742, 0.995407474618135}, - {91446, 0.995450509581448}, - {92160, 0.995493544544761}, - {92886, 0.995536579508074}, - {93623, 0.995579614471387}, - {94372, 0.9956226494347}, - {95133, 0.995665684398013}, - {95906, 0.995708719361326}, - {96692, 0.995751754324639}, - {97492, 0.995794789287952}, - {98304, 0.995837824251265}, - {99130, 0.995880859214578}, - {99970, 0.995923894177891}, - {100825, 0.995966929141204}, - {101694, 0.996009964104517}, - {102578, 0.996052999067831}, - {103478, 0.996096034031144}, - {104394, 0.996139068994457}, - {105326, 0.99618210395777}, - {106275, 0.996225138921083}, - {107241, 0.996268173884396}, - {108225, 0.996311208847709}, - {109227, 0.996354243811022}, - {110247, 0.996397278774335}, - {111288, 0.996440313737648}, - {112347, 0.996483348700961}, - {113428, 0.996526383664274}, - {114529, 0.996569418627587}, - {115652, 0.9966124535909}, - {116797, 0.996655488554213}, - {117965, 0.996698523517526}, - {119156, 0.996741558480839}, - {120372, 0.996784593444152}, - {121613, 0.996827628407465}, - {122880, 0.996870663370778}, - {124173, 0.996913698334091}, - {125494, 0.996956733297404}, - {126844, 0.996999768260717}, - {128223, 0.99704280322403}, - {129632, 0.997085838187344}, - {131072, 0.997128873150657}, - {131630, 0.997140095435483}, - {132192, 0.997151317720308}, - {132760, 0.997162540005134}, - {133332, 0.99717376228996}, - {133909, 0.997184984574786}, - {134491, 0.997196206859612}, - {135079, 0.997207429144438}, - {135671, 0.997218651429264}, - {136269, 0.99722987371409}, - {136872, 0.997241095998916}, - {137480, 0.997252318283742}, - {138094, 0.997263540568568}, - {138713, 0.997274762853394}, - {139338, 0.99728598513822}, - {139968, 0.997297207423046}, - {140605, 0.997308429707872}, - {141247, 0.997319651992698}, - {141894, 0.997330874277524}, - {142548, 0.997342096562349}, - {143208, 0.997353318847175}, - {143874, 0.997364541132001}, - {144547, 0.997375763416827}, - {145225, 0.997386985701653}, - {145910, 0.997398207986479}, - {146602, 0.997409430271305}, - {147300, 0.997420652556131}, - {148005, 0.997431874840957}, - {148716, 0.997443097125783}, - {149435, 0.997454319410609}, - {150160, 0.997465541695435}, - {150893, 0.997476763980261}, - {151632, 0.997487986265087}, - {152379, 0.997499208549913}, - {153134, 0.997510430834739}, - {153895, 0.997521653119564}, - {154665, 0.99753287540439}, - {155442, 0.997544097689216}, - {156227, 0.997555319974042}, - {157020, 0.997566542258868}, - {157821, 0.997577764543694}, - {158631, 0.99758898682852}, - {159448, 0.997600209113346}, - {160275, 0.997611431398172}, - {161109, 0.997622653682998}, - {161953, 0.997633875967824}, - {162805, 0.99764509825265}, - {163667, 0.997656320537476}, - {164537, 0.997667542822302}, - {165417, 0.997678765107128}, - {166306, 0.997689987391954}, - {167205, 0.997701209676779}, - {168114, 0.997712431961605}, - {169033, 0.997723654246431}, - {169961, 0.997734876531257}, - 
{170901, 0.997746098816083}, - {171850, 0.997757321100909}, - {172810, 0.997768543385735}, - {173781, 0.997779765670561}, - {174763, 0.997790987955387}, - {175756, 0.997802210240213}, - {176760, 0.997813432525039}, - {177776, 0.997824654809865}, - {178803, 0.997835877094691}, - {179843, 0.997847099379517}, - {180895, 0.997858321664343}, - {181959, 0.997869543949169}, - {183035, 0.997880766233995}, - {184125, 0.99789198851882}, - {185227, 0.997903210803646}, - {186343, 0.997914433088472}, - {187473, 0.997925655373298}, - {188616, 0.997936877658124}, - {189773, 0.99794809994295}, - {190944, 0.997959322227776}, - {192130, 0.997970544512602}, - {193331, 0.997981766797428}, - {194547, 0.997992989082254}, - {195778, 0.99800421136708}, - {197025, 0.998015433651906}, - {198288, 0.998026655936732}, - {199568, 0.998037878221558}, - {200864, 0.998049100506384}, - {202176, 0.99806032279121}, - {203507, 0.998071545076035}, - {204854, 0.998082767360861}, - {206220, 0.998093989645687}, - {207604, 0.998105211930513}, - {209007, 0.998116434215339}, - {210429, 0.998127656500165}, - {211870, 0.998138878784991}, - {213331, 0.998150101069817}, - {214812, 0.998161323354643}, - {216315, 0.998172545639469}, - {217838, 0.998183767924295}, - {219383, 0.998194990209121}, - {220950, 0.998206212493947}, - {222540, 0.998217434778773}, - {224152, 0.998228657063599}, - {225788, 0.998239879348425}, - {227448, 0.99825110163325}, - {229133, 0.998262323918076}, - {230843, 0.998273546202902}, - {232579, 0.998284768487728}, - {234341, 0.998295990772554}, - {236130, 0.99830721305738}, - {237946, 0.998318435342206}, - {239791, 0.998329657627032}, - {241664, 0.998340879911858}, - {243567, 0.998352102196684}, - {245500, 0.99836332448151}, - {247464, 0.998374546766336}, - {249460, 0.998385769051162}, - {251488, 0.998396991335988}, - {253549, 0.998408213620814}, - {255645, 0.99841943590564}, - {257775, 0.998430658190466}, - {259941, 0.998441880475291}, - {262144, 0.998453102760117}, - {263003, 0.998459933656693}, - {263869, 0.998466764553268}, - {264739, 0.998473595449844}, - {265616, 0.998480426346419}, - {266499, 0.998487257242994}, - {267387, 0.99849408813957}, - {268281, 0.998500919036145}, - {269181, 0.998507749932721}, - {270088, 0.998514580829296}, - {271000, 0.998521411725872}, - {271919, 0.998528242622447}, - {272844, 0.998535073519022}, - {273775, 0.998541904415598}, - {274713, 0.998548735312173}, - {275657, 0.998555566208749}, - {276607, 0.998562397105324}, - {277564, 0.998569228001899}, - {278528, 0.998576058898475}, - {279498, 0.99858288979505}, - {280476, 0.998589720691626}, - {281460, 0.998596551588201}, - {282451, 0.998603382484777}, - {283449, 0.998610213381352}, - {284454, 0.998617044277927}, - {285466, 0.998623875174503}, - {286486, 0.998630706071078}, - {287513, 0.998637536967654}, - {288547, 0.998644367864229}, - {289589, 0.998651198760804}, - {290638, 0.99865802965738}, - {291695, 0.998664860553955}, - {292759, 0.998671691450531}, - {293832, 0.998678522347106}, - {294912, 0.998685353243681}, - {296000, 0.998692184140257}, - {297097, 0.998699015036832}, - {298201, 0.998705845933408}, - {299314, 0.998712676829983}, - {300435, 0.998719507726558}, - {301564, 0.998726338623134}, - {302702, 0.998733169519709}, - {303849, 0.998740000416285}, - {305004, 0.99874683131286}, - {306168, 0.998753662209436}, - {307341, 0.998760493106011}, - {308523, 0.998767324002586}, - {309715, 0.998774154899162}, - {310915, 0.998780985795737}, - {312125, 0.998787816692313}, - {313344, 0.998794647588888}, - {314573, 0.998801478485463}, - 
{315811, 0.998808309382039}, - {317060, 0.998815140278614}, - {318318, 0.99882197117519}, - {319586, 0.998828802071765}, - {320864, 0.99883563296834}, - {322153, 0.998842463864916}, - {323452, 0.998849294761491}, - {324761, 0.998856125658067}, - {326082, 0.998862956554642}, - {327413, 0.998869787451218}, - {328754, 0.998876618347793}, - {330107, 0.998883449244368}, - {331471, 0.998890280140944}, - {332847, 0.998897111037519}, - {334234, 0.998903941934095}, - {335632, 0.99891077283067}, - {337042, 0.998917603727245}, - {338464, 0.998924434623821}, - {339899, 0.998931265520396}, - {341345, 0.998938096416972}, - {342804, 0.998944927313547}, - {344275, 0.998951758210122}, - {345759, 0.998958589106698}, - {347256, 0.998965420003273}, - {348765, 0.998972250899849}, - {350288, 0.998979081796424}, - {351825, 0.998985912693}, - {353375, 0.998992743589575}, - {354938, 0.99899957448615}, - {356516, 0.999006405382726}, - {358107, 0.999013236279301}, - {359713, 0.999020067175877}, - {361334, 0.999026898072452}, - {362969, 0.999033728969027}, - {364618, 0.999040559865603}, - {366283, 0.999047390762178}, - {367964, 0.999054221658754}, - {369659, 0.999061052555329}, - {371371, 0.999067883451904}, - {373098, 0.99907471434848}, - {374841, 0.999081545245055}, - {376601, 0.999088376141631}, - {378378, 0.999095207038206}, - {380171, 0.999102037934781}, - {381981, 0.999108868831357}, - {383809, 0.999115699727932}, - {385654, 0.999122530624508}, - {387517, 0.999129361521083}, - {389398, 0.999136192417658}, - {391298, 0.999143023314234}, - {393216, 0.999149854210809}, - {395153, 0.999156685107385}, - {397109, 0.99916351600396}, - {399085, 0.999170346900536}, - {401080, 0.999177177797111}, - {403096, 0.999184008693686}, - {405132, 0.999190839590262}, - {407188, 0.999197670486837}, - {409266, 0.999204501383413}, - {411364, 0.999211332279988}, - {413485, 0.999218163176564}, - {415627, 0.999224994073139}, - {417792, 0.999231824969714}, - {419979, 0.99923865586629}, - {422190, 0.999245486762865}, - {424424, 0.999252317659441}, - {426681, 0.999259148556016}, - {428963, 0.999265979452591}, - {431269, 0.999272810349167}, - {433600, 0.999279641245742}, - {435957, 0.999286472142318}, - {438339, 0.999293303038893}, - {440748, 0.999300133935468}, - {443183, 0.999306964832044}, - {445645, 0.999313795728619}, - {448134, 0.999320626625195}, - {450652, 0.99932745752177}, - {453198, 0.999334288418345}, - {455773, 0.999341119314921}, - {458378, 0.999347950211496}, - {461012, 0.999354781108072}, - {463677, 0.999361612004647}, - {466372, 0.999368442901223}, - {469100, 0.999375273797798}, - {471859, 0.999382104694373}, - {474651, 0.999388935590949}, - {477477, 0.999395766487524}, - {480336, 0.9994025973841}, - {483229, 0.999409428280675}, - {486158, 0.99941625917725}, - {489122, 0.999423090073826}, - {492123, 0.999429920970401}, - {495161, 0.999436751866977}, - {498236, 0.999443582763552}, - {501350, 0.999450413660127}, - {504504, 0.999457244556703}, - {507697, 0.999464075453278}, - {510930, 0.999470906349854}, - {514206, 0.999477737246429}, - {517523, 0.999484568143004}, - {520884, 0.99949139903958}, - {524288, 0.999498229936155}, - {526844, 0.999500751393763}, - {529425, 0.99950327285137}, - {532031, 0.999505794308977}, - {534663, 0.999508315766584}, - {537322, 0.999510837224192}, - {540006, 0.999513358681799}, - {542718, 0.999515880139406}, - {545458, 0.999518401597013}, - {548225, 0.999520923054621}, - {551020, 0.999523444512228}, - {553844, 0.999525965969835}, - {556697, 0.999528487427442}, - {559579, 0.99953100888505}, - 
{562492, 0.999533530342657}, - {565435, 0.999536051800264}, - {568408, 0.999538573257872}, - {571414, 0.999541094715479}, - {574451, 0.999543616173086}, - {577521, 0.999546137630693}, - {580624, 0.9995486590883}, - {583760, 0.999551180545908}, - {586931, 0.999553702003515}, - {590136, 0.999556223461122}, - {593376, 0.99955874491873}, - {596652, 0.999561266376337}, - {599964, 0.999563787833944}, - {603313, 0.999566309291551}, - {606700, 0.999568830749159}, - {610125, 0.999571352206766}, - {613590, 0.999573873664373}, - {617093, 0.99957639512198}, - {620637, 0.999578916579588}, - {624222, 0.999581438037195}, - {627848, 0.999583959494802}, - {631517, 0.999586480952409}, - {635229, 0.999589002410017}, - {638985, 0.999591523867624}, - {642786, 0.999594045325231}, - {646632, 0.999596566782838}, - {650524, 0.999599088240446}, - {654463, 0.999601609698053}, - {658451, 0.99960413115566}, - {662487, 0.999606652613268}, - {666573, 0.999609174070875}, - {670710, 0.999611695528482}, - {674899, 0.999614216986089}, - {679140, 0.999616738443697}, - {683435, 0.999619259901304}, - {687784, 0.999621781358911}, - {692190, 0.999624302816518}, - {696652, 0.999626824274126}, - {701172, 0.999629345731733}, - {705750, 0.99963186718934}, - {710390, 0.999634388646947}, - {715090, 0.999636910104555}, - {719853, 0.999639431562162}, - {724680, 0.999641953019769}, - {729573, 0.999644474477376}, - {734531, 0.999646995934984}, - {739558, 0.999649517392591}, - {744654, 0.999652038850198}, - {749820, 0.999654560307805}, - {755059, 0.999657081765413}, - {760371, 0.99965960322302}, - {765759, 0.999662124680627}, - {771224, 0.999664646138234}, - {776767, 0.999667167595842}, - {782390, 0.999669689053449}, - {788096, 0.999672210511056}, - {793885, 0.999674731968664}, - {799760, 0.999677253426271}, - {805723, 0.999679774883878}, - {811775, 0.999682296341485}, - {817919, 0.999684817799093}, - {824156, 0.9996873392567}, - {830490, 0.999689860714307}, - {836921, 0.999692382171914}, - {843453, 0.999694903629522}, - {850088, 0.999697425087129}, - {856827, 0.999699946544736}, - {863675, 0.999702468002343}, - {870633, 0.999704989459951}, - {877704, 0.999707510917558}, - {884890, 0.999710032375165}, - {892196, 0.999712553832772}, - {899623, 0.99971507529038}, - {907174, 0.999717596747987}, - {914854, 0.999720118205594}, - {922664, 0.999722639663201}, - {930609, 0.999725161120809}, - {938693, 0.999727682578416}, - {946917, 0.999730204036023}, - {955288, 0.99973272549363}, - {963807, 0.999735246951238}, - {972480, 0.999737768408845}, - {981310, 0.999740289866452}, - {990302, 0.99974281132406}, - {999461, 0.999745332781667}, - {1008790, 0.999747854239274}, - {1018296, 0.999750375696881}, - {1027982, 0.999752897154489}, - {1037854, 0.999755418612096}, - {1047917, 0.999757940069703}, - {1058178, 0.99976046152731}, - {1068642, 0.999762982984918}, - {1079314, 0.999765504442525}, - {1090202, 0.999768025900132}, - {1101312, 0.999770547357739}, - {1112651, 0.999773068815347}, - {1124225, 0.999775590272954}, - {1136043, 0.999778111730561}, - {1148112, 0.999780633188168}, - {1160440, 0.999783154645776}, - {1173036, 0.999785676103383}, - {1185908, 0.99978819756099}, - {1199066, 0.999790719018598}, - {1212519, 0.999793240476205}, - {1226277, 0.999795761933812}, - {1240351, 0.999798283391419}, - {1254752, 0.999800804849026}, - {1269492, 0.999803326306634}, - {1284581, 0.999805847764241}, - {1300034, 0.999808369221848}, - {1315863, 0.999810890679455}, - {1332082, 0.999813412137063}, - {1348706, 0.99981593359467}, - {1365751, 0.999818455052277}, - 
{1383231, 0.999820976509885}, - {1401165, 0.999823497967492}, - {1419570, 0.999826019425099}, - {1438465, 0.999828540882706}, - {1457870, 0.999831062340314}, - {1477805, 0.999833583797921}, - {1498294, 0.999836105255528}, - {1519358, 0.999838626713135}, - {1541023, 0.999841148170743}, - {1563315, 0.99984366962835}, - {1586261, 0.999846191085957}, - {1609891, 0.999848712543564}, - {1634235, 0.999851234001172}, - {1659327, 0.999853755458779}, - {1685202, 0.999856276916386}, - {1711896, 0.999858798373993}, - {1739450, 0.999861319831601}, - {1767905, 0.999863841289208}, - {1797307, 0.999866362746815}, - {1827703, 0.999868884204423}, - {1859145, 0.99987140566203}, - {1891687, 0.999873927119637}, - {1925389, 0.999876448577244}, - {1960314, 0.999878970034851}, - {1996529, 0.999881491492459}, - {2034108, 0.999884012950066}, - {2073128, 0.999886534407673}, - {2113675, 0.999889055865281}, - {2155839, 0.999891577322888}, - {2199720, 0.999894098780495}, - {2245424, 0.999896620238102}, - {2293067, 0.99989914169571}, - {2342777, 0.999901663153317}, - {2394689, 0.999904184610924}, - {2448954, 0.999906706068531}, - {2505735, 0.999909227526139}, - {2565212, 0.999911748983746}, - {2627581, 0.999914270441353}, - {2693059, 0.99991679189896}, - {2761883, 0.999919313356568}, - {2834317, 0.999921834814175}, - {2910653, 0.999924356271782}, - {2991214, 0.999926877729389}, - {3076362, 0.999929399186997}, - {3166500, 0.999931920644604}, - {3262080, 0.999934442102211}, - {3363608, 0.999936963559819}, - {3471660, 0.999939485017426}, - {3586885, 0.999942006475033}, - {3710020, 0.99994452793264}, - {3841911, 0.999947049390248}, - {3983524, 0.999949570847855}, - {4135977, 0.999952092305462}, - {4300563, 0.999954613763069}, - {4478791, 0.999957135220677}, - {4672430, 0.999959656678284}, - {4883570, 0.999962178135891}, - {5114695, 0.999964699593498}, - {5368784, 0.999967221051106}, - {5649438, 0.999969742508713}, - {5961053, 0.99997226396632}, - {6309051, 0.999974785423927}, - {6700199, 0.999977306881535}, - {7143054, 0.999979828339142}, - {7648594, 0.999982349796749}, - {8231141, 0.999984871254356}, - {8909743, 0.999987392711964}, - {9710291, 0.999989914169571}, - {10668901, 0.999992435627178}, - {11837511, 0.999994957084785}, - {13293619, 0.999997478542393}, - {15158197, 1.0}, +dist_point_gen::weight dist_point_gen::w3[] = { + {8, 0.064882623003}, + {32, 0.032441311501}, + {36, 0.011589360408}, + {40, 0.011589360408}, + {46, 0.011589360408}, + {53, 0.011589360408}, + {64, 0.011589360408}, + {70, 0.032717067661}, + {77, 0.032717067661}, + {85, 0.032717067661}, + {96, 0.032717067661}, + {110, 0.032717067661}, + {128, 0.032717067661}, + {137, 0.018290952605}, + {146, 0.018290952605}, + {158, 0.018290952605}, + {171, 0.018290952605}, + {186, 0.018290952605}, + {205, 0.018290952605}, + {228, 0.018290952605}, + {256, 0.018290952605}, + {268, 0.026092771524}, + {282, 0.026092771524}, + {296, 0.026092771524}, + {313, 0.026092771524}, + {331, 0.026092771524}, + {352, 0.026092771524}, + {375, 0.026092771524}, + {402, 0.026092771524}, + {433, 0.026092771524}, + {469, 0.026092771524}, + {512, 0.026092771524}, + {531, 0.005591308448}, + {551, 0.005591308448}, + {573, 0.005591308448}, + {597, 0.005591308448}, + {623, 0.005591308448}, + {652, 0.005591308448}, + {683, 0.005591308448}, + {717, 0.005591308448}, + {755, 0.005591308448}, + {796, 0.005591308448}, + {843, 0.005591308448}, + {896, 0.005591308448}, + {956, 0.005591308448}, + {1024, 0.005591308448}, + {1053, 0.002536797540}, + {1084, 0.002536797540}, + {1117, 0.002536797540}, + 
{1152, 0.002536797540}, + {1189, 0.002536797540}, + {1229, 0.002536797540}, + {1271, 0.002536797540}, + {1317, 0.002536797540}, + {1365, 0.002536797540}, + {1418, 0.002536797540}, + {1475, 0.002536797540}, + {1536, 0.002536797540}, + {1603, 0.002536797540}, + {1676, 0.002536797540}, + {1755, 0.002536797540}, + {1843, 0.002536797540}, + {1940, 0.002536797540}, + {2048, 0.002536797540}, + {2092, 0.001087196886}, + {2137, 0.001087196886}, + {2185, 0.001087196886}, + {2234, 0.001087196886}, + {2286, 0.001087196886}, + {2341, 0.001087196886}, + {2398, 0.001087196886}, + {2458, 0.001087196886}, + {2521, 0.001087196886}, + {2587, 0.001087196886}, + {2657, 0.001087196886}, + {2731, 0.001087196886}, + {2809, 0.001087196886}, + {2891, 0.001087196886}, + {2979, 0.001087196886}, + {3072, 0.001087196886}, + {3171, 0.001087196886}, + {3277, 0.001087196886}, + {3390, 0.001087196886}, + {3511, 0.001087196886}, + {3641, 0.001087196886}, + {3781, 0.001087196886}, + {3932, 0.001087196886}, + {4096, 0.001087196886}, + {4163, 0.000946916618}, + {4233, 0.000946916618}, + {4304, 0.000946916618}, + {4378, 0.000946916618}, + {4455, 0.000946916618}, + {4535, 0.000946916618}, + {4617, 0.000946916618}, + {4703, 0.000946916618}, + {4792, 0.000946916618}, + {4884, 0.000946916618}, + {4979, 0.000946916618}, + {5079, 0.000946916618}, + {5183, 0.000946916618}, + {5291, 0.000946916618}, + {5403, 0.000946916618}, + {5521, 0.000946916618}, + {5643, 0.000946916618}, + {5772, 0.000946916618}, + {5906, 0.000946916618}, + {6046, 0.000946916618}, + {6194, 0.000946916618}, + {6349, 0.000946916618}, + {6512, 0.000946916618}, + {6683, 0.000946916618}, + {6864, 0.000946916618}, + {7054, 0.000946916618}, + {7256, 0.000946916618}, + {7469, 0.000946916618}, + {7696, 0.000946916618}, + {7936, 0.000946916618}, + {8192, 0.000946916618}, + {8293, 0.000497194551}, + {8397, 0.000497194551}, + {8503, 0.000497194551}, + {8612, 0.000497194551}, + {8724, 0.000497194551}, + {8839, 0.000497194551}, + {8957, 0.000497194551}, + {9078, 0.000497194551}, + {9202, 0.000497194551}, + {9330, 0.000497194551}, + {9461, 0.000497194551}, + {9596, 0.000497194551}, + {9735, 0.000497194551}, + {9879, 0.000497194551}, + {10026, 0.000497194551}, + {10178, 0.000497194551}, + {10335, 0.000497194551}, + {10496, 0.000497194551}, + {10663, 0.000497194551}, + {10835, 0.000497194551}, + {11012, 0.000497194551}, + {11196, 0.000497194551}, + {11385, 0.000497194551}, + {11582, 0.000497194551}, + {11785, 0.000497194551}, + {11995, 0.000497194551}, + {12214, 0.000497194551}, + {12440, 0.000497194551}, + {12674, 0.000497194551}, + {12918, 0.000497194551}, + {13171, 0.000497194551}, + {13435, 0.000497194551}, + {13709, 0.000497194551}, + {13995, 0.000497194551}, + {14292, 0.000497194551}, + {14603, 0.000497194551}, + {14928, 0.000497194551}, + {15267, 0.000497194551}, + {15622, 0.000497194551}, + {15994, 0.000497194551}, + {16384, 0.000497194551}, + {16540, 0.000107694235}, + {16699, 0.000107694235}, + {16861, 0.000107694235}, + {17027, 0.000107694235}, + {17195, 0.000107694235}, + {17367, 0.000107694235}, + {17542, 0.000107694235}, + {17721, 0.000107694235}, + {17904, 0.000107694235}, + {18091, 0.000107694235}, + {18281, 0.000107694235}, + {18476, 0.000107694235}, + {18674, 0.000107694235}, + {18877, 0.000107694235}, + {19085, 0.000107694235}, + {19297, 0.000107694235}, + {19514, 0.000107694235}, + {19735, 0.000107694235}, + {19962, 0.000107694235}, + {20194, 0.000107694235}, + {20432, 0.000107694235}, + {20675, 0.000107694235}, + {20924, 0.000107694235}, + {21179, 
0.000107694235}, + {21441, 0.000107694235}, + {21709, 0.000107694235}, + {21984, 0.000107694235}, + {22265, 0.000107694235}, + {22555, 0.000107694235}, + {22851, 0.000107694235}, + {23156, 0.000107694235}, + {23469, 0.000107694235}, + {23790, 0.000107694235}, + {24121, 0.000107694235}, + {24461, 0.000107694235}, + {24810, 0.000107694235}, + {25170, 0.000107694235}, + {25540, 0.000107694235}, + {25921, 0.000107694235}, + {26314, 0.000107694235}, + {26719, 0.000107694235}, + {27136, 0.000107694235}, + {27567, 0.000107694235}, + {28011, 0.000107694235}, + {28471, 0.000107694235}, + {28945, 0.000107694235}, + {29436, 0.000107694235}, + {29943, 0.000107694235}, + {30468, 0.000107694235}, + {31013, 0.000107694235}, + {31576, 0.000107694235}, + {32161, 0.000107694235}, + {32768, 0.000107694235}, + {33007, 0.000041360737}, + {33250, 0.000041360737}, + {33496, 0.000041360737}, + {33746, 0.000041360737}, + {34000, 0.000041360737}, + {34257, 0.000041360737}, + {34519, 0.000041360737}, + {34784, 0.000041360737}, + {35054, 0.000041360737}, + {35328, 0.000041360737}, + {35606, 0.000041360737}, + {35889, 0.000041360737}, + {36176, 0.000041360737}, + {36468, 0.000041360737}, + {36764, 0.000041360737}, + {37065, 0.000041360737}, + {37372, 0.000041360737}, + {37683, 0.000041360737}, + {38000, 0.000041360737}, + {38322, 0.000041360737}, + {38649, 0.000041360737}, + {38983, 0.000041360737}, + {39322, 0.000041360737}, + {39667, 0.000041360737}, + {40018, 0.000041360737}, + {40375, 0.000041360737}, + {40739, 0.000041360737}, + {41109, 0.000041360737}, + {41486, 0.000041360737}, + {41870, 0.000041360737}, + {42262, 0.000041360737}, + {42660, 0.000041360737}, + {43067, 0.000041360737}, + {43481, 0.000041360737}, + {43903, 0.000041360737}, + {44333, 0.000041360737}, + {44772, 0.000041360737}, + {45220, 0.000041360737}, + {45677, 0.000041360737}, + {46143, 0.000041360737}, + {46618, 0.000041360737}, + {47104, 0.000041360737}, + {47600, 0.000041360737}, + {48106, 0.000041360737}, + {48623, 0.000041360737}, + {49152, 0.000041360737}, + {49692, 0.000041360737}, + {50244, 0.000041360737}, + {50809, 0.000041360737}, + {51386, 0.000041360737}, + {51977, 0.000041360737}, + {52581, 0.000041360737}, + {53200, 0.000041360737}, + {53833, 0.000041360737}, + {54482, 0.000041360737}, + {55146, 0.000041360737}, + {55827, 0.000041360737}, + {56525, 0.000041360737}, + {57240, 0.000041360737}, + {57974, 0.000041360737}, + {58727, 0.000041360737}, + {59500, 0.000041360737}, + {60293, 0.000041360737}, + {61108, 0.000041360737}, + {61945, 0.000041360737}, + {62805, 0.000041360737}, + {63690, 0.000041360737}, + {64600, 0.000041360737}, + {65536, 0.000041360737}, + {65902, 0.000043034963}, + {66272, 0.000043034963}, + {66647, 0.000043034963}, + {67025, 0.000043034963}, + {67408, 0.000043034963}, + {67796, 0.000043034963}, + {68188, 0.000043034963}, + {68584, 0.000043034963}, + {68985, 0.000043034963}, + {69391, 0.000043034963}, + {69802, 0.000043034963}, + {70217, 0.000043034963}, + {70638, 0.000043034963}, + {71063, 0.000043034963}, + {71494, 0.000043034963}, + {71930, 0.000043034963}, + {72371, 0.000043034963}, + {72818, 0.000043034963}, + {73270, 0.000043034963}, + {73728, 0.000043034963}, + {74192, 0.000043034963}, + {74661, 0.000043034963}, + {75137, 0.000043034963}, + {75618, 0.000043034963}, + {76106, 0.000043034963}, + {76601, 0.000043034963}, + {77101, 0.000043034963}, + {77608, 0.000043034963}, + {78122, 0.000043034963}, + {78643, 0.000043034963}, + {79171, 0.000043034963}, + {79706, 0.000043034963}, + {80248, 0.000043034963}, 
+ {80798, 0.000043034963}, + {81355, 0.000043034963}, + {81920, 0.000043034963}, + {82493, 0.000043034963}, + {83074, 0.000043034963}, + {83663, 0.000043034963}, + {84261, 0.000043034963}, + {84867, 0.000043034963}, + {85482, 0.000043034963}, + {86106, 0.000043034963}, + {86739, 0.000043034963}, + {87381, 0.000043034963}, + {88033, 0.000043034963}, + {88695, 0.000043034963}, + {89367, 0.000043034963}, + {90049, 0.000043034963}, + {90742, 0.000043034963}, + {91446, 0.000043034963}, + {92160, 0.000043034963}, + {92886, 0.000043034963}, + {93623, 0.000043034963}, + {94372, 0.000043034963}, + {95133, 0.000043034963}, + {95906, 0.000043034963}, + {96692, 0.000043034963}, + {97492, 0.000043034963}, + {98304, 0.000043034963}, + {99130, 0.000043034963}, + {99970, 0.000043034963}, + {100825, 0.000043034963}, + {101694, 0.000043034963}, + {102578, 0.000043034963}, + {103478, 0.000043034963}, + {104394, 0.000043034963}, + {105326, 0.000043034963}, + {106275, 0.000043034963}, + {107241, 0.000043034963}, + {108225, 0.000043034963}, + {109227, 0.000043034963}, + {110247, 0.000043034963}, + {111288, 0.000043034963}, + {112347, 0.000043034963}, + {113428, 0.000043034963}, + {114529, 0.000043034963}, + {115652, 0.000043034963}, + {116797, 0.000043034963}, + {117965, 0.000043034963}, + {119156, 0.000043034963}, + {120372, 0.000043034963}, + {121613, 0.000043034963}, + {122880, 0.000043034963}, + {124173, 0.000043034963}, + {125494, 0.000043034963}, + {126844, 0.000043034963}, + {128223, 0.000043034963}, + {129632, 0.000043034963}, + {131072, 0.000043034963}, + {131630, 0.000011222285}, + {132192, 0.000011222285}, + {132760, 0.000011222285}, + {133332, 0.000011222285}, + {133909, 0.000011222285}, + {134491, 0.000011222285}, + {135079, 0.000011222285}, + {135671, 0.000011222285}, + {136269, 0.000011222285}, + {136872, 0.000011222285}, + {137480, 0.000011222285}, + {138094, 0.000011222285}, + {138713, 0.000011222285}, + {139338, 0.000011222285}, + {139968, 0.000011222285}, + {140605, 0.000011222285}, + {141247, 0.000011222285}, + {141894, 0.000011222285}, + {142548, 0.000011222285}, + {143208, 0.000011222285}, + {143874, 0.000011222285}, + {144547, 0.000011222285}, + {145225, 0.000011222285}, + {145910, 0.000011222285}, + {146602, 0.000011222285}, + {147300, 0.000011222285}, + {148005, 0.000011222285}, + {148716, 0.000011222285}, + {149435, 0.000011222285}, + {150160, 0.000011222285}, + {150893, 0.000011222285}, + {151632, 0.000011222285}, + {152379, 0.000011222285}, + {153134, 0.000011222285}, + {153895, 0.000011222285}, + {154665, 0.000011222285}, + {155442, 0.000011222285}, + {156227, 0.000011222285}, + {157020, 0.000011222285}, + {157821, 0.000011222285}, + {158631, 0.000011222285}, + {159448, 0.000011222285}, + {160275, 0.000011222285}, + {161109, 0.000011222285}, + {161953, 0.000011222285}, + {162805, 0.000011222285}, + {163667, 0.000011222285}, + {164537, 0.000011222285}, + {165417, 0.000011222285}, + {166306, 0.000011222285}, + {167205, 0.000011222285}, + {168114, 0.000011222285}, + {169033, 0.000011222285}, + {169961, 0.000011222285}, + {170901, 0.000011222285}, + {171850, 0.000011222285}, + {172810, 0.000011222285}, + {173781, 0.000011222285}, + {174763, 0.000011222285}, + {175756, 0.000011222285}, + {176760, 0.000011222285}, + {177776, 0.000011222285}, + {178803, 0.000011222285}, + {179843, 0.000011222285}, + {180895, 0.000011222285}, + {181959, 0.000011222285}, + {183035, 0.000011222285}, + {184125, 0.000011222285}, + {185227, 0.000011222285}, + {186343, 0.000011222285}, + {187473, 0.000011222285}, 
+ {188616, 0.000011222285}, + {189773, 0.000011222285}, + {190944, 0.000011222285}, + {192130, 0.000011222285}, + {193331, 0.000011222285}, + {194547, 0.000011222285}, + {195778, 0.000011222285}, + {197025, 0.000011222285}, + {198288, 0.000011222285}, + {199568, 0.000011222285}, + {200864, 0.000011222285}, + {202176, 0.000011222285}, + {203507, 0.000011222285}, + {204854, 0.000011222285}, + {206220, 0.000011222285}, + {207604, 0.000011222285}, + {209007, 0.000011222285}, + {210429, 0.000011222285}, + {211870, 0.000011222285}, + {213331, 0.000011222285}, + {214812, 0.000011222285}, + {216315, 0.000011222285}, + {217838, 0.000011222285}, + {219383, 0.000011222285}, + {220950, 0.000011222285}, + {222540, 0.000011222285}, + {224152, 0.000011222285}, + {225788, 0.000011222285}, + {227448, 0.000011222285}, + {229133, 0.000011222285}, + {230843, 0.000011222285}, + {232579, 0.000011222285}, + {234341, 0.000011222285}, + {236130, 0.000011222285}, + {237946, 0.000011222285}, + {239791, 0.000011222285}, + {241664, 0.000011222285}, + {243567, 0.000011222285}, + {245500, 0.000011222285}, + {247464, 0.000011222285}, + {249460, 0.000011222285}, + {251488, 0.000011222285}, + {253549, 0.000011222285}, + {255645, 0.000011222285}, + {257775, 0.000011222285}, + {259941, 0.000011222285}, + {262144, 0.000011222285}, + {263003, 0.000006830897}, + {263869, 0.000006830897}, + {264739, 0.000006830897}, + {265616, 0.000006830897}, + {266499, 0.000006830897}, + {267387, 0.000006830897}, + {268281, 0.000006830897}, + {269181, 0.000006830897}, + {270088, 0.000006830897}, + {271000, 0.000006830897}, + {271919, 0.000006830897}, + {272844, 0.000006830897}, + {273775, 0.000006830897}, + {274713, 0.000006830897}, + {275657, 0.000006830897}, + {276607, 0.000006830897}, + {277564, 0.000006830897}, + {278528, 0.000006830897}, + {279498, 0.000006830897}, + {280476, 0.000006830897}, + {281460, 0.000006830897}, + {282451, 0.000006830897}, + {283449, 0.000006830897}, + {284454, 0.000006830897}, + {285466, 0.000006830897}, + {286486, 0.000006830897}, + {287513, 0.000006830897}, + {288547, 0.000006830897}, + {289589, 0.000006830897}, + {290638, 0.000006830897}, + {291695, 0.000006830897}, + {292759, 0.000006830897}, + {293832, 0.000006830897}, + {294912, 0.000006830897}, + {296000, 0.000006830897}, + {297097, 0.000006830897}, + {298201, 0.000006830897}, + {299314, 0.000006830897}, + {300435, 0.000006830897}, + {301564, 0.000006830897}, + {302702, 0.000006830897}, + {303849, 0.000006830897}, + {305004, 0.000006830897}, + {306168, 0.000006830897}, + {307341, 0.000006830897}, + {308523, 0.000006830897}, + {309715, 0.000006830897}, + {310915, 0.000006830897}, + {312125, 0.000006830897}, + {313344, 0.000006830897}, + {314573, 0.000006830897}, + {315811, 0.000006830897}, + {317060, 0.000006830897}, + {318318, 0.000006830897}, + {319586, 0.000006830897}, + {320864, 0.000006830897}, + {322153, 0.000006830897}, + {323452, 0.000006830897}, + {324761, 0.000006830897}, + {326082, 0.000006830897}, + {327413, 0.000006830897}, + {328754, 0.000006830897}, + {330107, 0.000006830897}, + {331471, 0.000006830897}, + {332847, 0.000006830897}, + {334234, 0.000006830897}, + {335632, 0.000006830897}, + {337042, 0.000006830897}, + {338464, 0.000006830897}, + {339899, 0.000006830897}, + {341345, 0.000006830897}, + {342804, 0.000006830897}, + {344275, 0.000006830897}, + {345759, 0.000006830897}, + {347256, 0.000006830897}, + {348765, 0.000006830897}, + {350288, 0.000006830897}, + {351825, 0.000006830897}, + {353375, 0.000006830897}, + {354938, 
0.000006830897}, + {356516, 0.000006830897}, + {358107, 0.000006830897}, + {359713, 0.000006830897}, + {361334, 0.000006830897}, + {362969, 0.000006830897}, + {364618, 0.000006830897}, + {366283, 0.000006830897}, + {367964, 0.000006830897}, + {369659, 0.000006830897}, + {371371, 0.000006830897}, + {373098, 0.000006830897}, + {374841, 0.000006830897}, + {376601, 0.000006830897}, + {378378, 0.000006830897}, + {380171, 0.000006830897}, + {381981, 0.000006830897}, + {383809, 0.000006830897}, + {385654, 0.000006830897}, + {387517, 0.000006830897}, + {389398, 0.000006830897}, + {391298, 0.000006830897}, + {393216, 0.000006830897}, + {395153, 0.000006830897}, + {397109, 0.000006830897}, + {399085, 0.000006830897}, + {401080, 0.000006830897}, + {403096, 0.000006830897}, + {405132, 0.000006830897}, + {407188, 0.000006830897}, + {409266, 0.000006830897}, + {411364, 0.000006830897}, + {413485, 0.000006830897}, + {415627, 0.000006830897}, + {417792, 0.000006830897}, + {419979, 0.000006830897}, + {422190, 0.000006830897}, + {424424, 0.000006830897}, + {426681, 0.000006830897}, + {428963, 0.000006830897}, + {431269, 0.000006830897}, + {433600, 0.000006830897}, + {435957, 0.000006830897}, + {438339, 0.000006830897}, + {440748, 0.000006830897}, + {443183, 0.000006830897}, + {445645, 0.000006830897}, + {448134, 0.000006830897}, + {450652, 0.000006830897}, + {453198, 0.000006830897}, + {455773, 0.000006830897}, + {458378, 0.000006830897}, + {461012, 0.000006830897}, + {463677, 0.000006830897}, + {466372, 0.000006830897}, + {469100, 0.000006830897}, + {471859, 0.000006830897}, + {474651, 0.000006830897}, + {477477, 0.000006830897}, + {480336, 0.000006830897}, + {483229, 0.000006830897}, + {486158, 0.000006830897}, + {489122, 0.000006830897}, + {492123, 0.000006830897}, + {495161, 0.000006830897}, + {498236, 0.000006830897}, + {501350, 0.000006830897}, + {504504, 0.000006830897}, + {507697, 0.000006830897}, + {510930, 0.000006830897}, + {514206, 0.000006830897}, + {517523, 0.000006830897}, + {520884, 0.000006830897}, + {524288, 0.000006830897}, + {526844, 0.000002521458}, + {529425, 0.000002521458}, + {532031, 0.000002521458}, + {534663, 0.000002521458}, + {537322, 0.000002521458}, + {540006, 0.000002521458}, + {542718, 0.000002521458}, + {545458, 0.000002521458}, + {548225, 0.000002521458}, + {551020, 0.000002521458}, + {553844, 0.000002521458}, + {556697, 0.000002521458}, + {559579, 0.000002521458}, + {562492, 0.000002521458}, + {565435, 0.000002521458}, + {568408, 0.000002521458}, + {571414, 0.000002521458}, + {574451, 0.000002521458}, + {577521, 0.000002521458}, + {580624, 0.000002521458}, + {583760, 0.000002521458}, + {586931, 0.000002521458}, + {590136, 0.000002521458}, + {593376, 0.000002521458}, + {596652, 0.000002521458}, + {599964, 0.000002521458}, + {603313, 0.000002521458}, + {606700, 0.000002521458}, + {610125, 0.000002521458}, + {613590, 0.000002521458}, + {617093, 0.000002521458}, + {620637, 0.000002521458}, + {624222, 0.000002521458}, + {627848, 0.000002521458}, + {631517, 0.000002521458}, + {635229, 0.000002521458}, + {638985, 0.000002521458}, + {642786, 0.000002521458}, + {646632, 0.000002521458}, + {650524, 0.000002521458}, + {654463, 0.000002521458}, + {658451, 0.000002521458}, + {662487, 0.000002521458}, + {666573, 0.000002521458}, + {670710, 0.000002521458}, + {674899, 0.000002521458}, + {679140, 0.000002521458}, + {683435, 0.000002521458}, + {687784, 0.000002521458}, + {692190, 0.000002521458}, + {696652, 0.000002521458}, + {701172, 0.000002521458}, + {705750, 0.000002521458}, + 
{710390, 0.000002521458}, + {715090, 0.000002521458}, + {719853, 0.000002521458}, + {724680, 0.000002521458}, + {729573, 0.000002521458}, + {734531, 0.000002521458}, + {739558, 0.000002521458}, + {744654, 0.000002521458}, + {749820, 0.000002521458}, + {755059, 0.000002521458}, + {760371, 0.000002521458}, + {765759, 0.000002521458}, + {771224, 0.000002521458}, + {776767, 0.000002521458}, + {782390, 0.000002521458}, + {788096, 0.000002521458}, + {793885, 0.000002521458}, + {799760, 0.000002521458}, + {805723, 0.000002521458}, + {811775, 0.000002521458}, + {817919, 0.000002521458}, + {824156, 0.000002521458}, + {830490, 0.000002521458}, + {836921, 0.000002521458}, + {843453, 0.000002521458}, + {850088, 0.000002521458}, + {856827, 0.000002521458}, + {863675, 0.000002521458}, + {870633, 0.000002521458}, + {877704, 0.000002521458}, + {884890, 0.000002521458}, + {892196, 0.000002521458}, + {899623, 0.000002521458}, + {907174, 0.000002521458}, + {914854, 0.000002521458}, + {922664, 0.000002521458}, + {930609, 0.000002521458}, + {938693, 0.000002521458}, + {946917, 0.000002521458}, + {955288, 0.000002521458}, + {963807, 0.000002521458}, + {972480, 0.000002521458}, + {981310, 0.000002521458}, + {990302, 0.000002521458}, + {999461, 0.000002521458}, + {1008790, 0.000002521458}, + {1018296, 0.000002521458}, + {1027982, 0.000002521458}, + {1037854, 0.000002521458}, + {1047917, 0.000002521458}, + {1058178, 0.000002521458}, + {1068642, 0.000002521458}, + {1079314, 0.000002521458}, + {1090202, 0.000002521458}, + {1101312, 0.000002521458}, + {1112651, 0.000002521458}, + {1124225, 0.000002521458}, + {1136043, 0.000002521458}, + {1148112, 0.000002521458}, + {1160440, 0.000002521458}, + {1173036, 0.000002521458}, + {1185908, 0.000002521458}, + {1199066, 0.000002521458}, + {1212519, 0.000002521458}, + {1226277, 0.000002521458}, + {1240351, 0.000002521458}, + {1254752, 0.000002521458}, + {1269492, 0.000002521458}, + {1284581, 0.000002521458}, + {1300034, 0.000002521458}, + {1315863, 0.000002521458}, + {1332082, 0.000002521458}, + {1348706, 0.000002521458}, + {1365751, 0.000002521458}, + {1383231, 0.000002521458}, + {1401165, 0.000002521458}, + {1419570, 0.000002521458}, + {1438465, 0.000002521458}, + {1457870, 0.000002521458}, + {1477805, 0.000002521458}, + {1498294, 0.000002521458}, + {1519358, 0.000002521458}, + {1541023, 0.000002521458}, + {1563315, 0.000002521458}, + {1586261, 0.000002521458}, + {1609891, 0.000002521458}, + {1634235, 0.000002521458}, + {1659327, 0.000002521458}, + {1685202, 0.000002521458}, + {1711896, 0.000002521458}, + {1739450, 0.000002521458}, + {1767905, 0.000002521458}, + {1797307, 0.000002521458}, + {1827703, 0.000002521458}, + {1859145, 0.000002521458}, + {1891687, 0.000002521458}, + {1925389, 0.000002521458}, + {1960314, 0.000002521458}, + {1996529, 0.000002521458}, + {2034108, 0.000002521458}, + {2073128, 0.000002521458}, + {2113675, 0.000002521458}, + {2155839, 0.000002521458}, + {2199720, 0.000002521458}, + {2245424, 0.000002521458}, + {2293067, 0.000002521458}, + {2342777, 0.000002521458}, + {2394689, 0.000002521458}, + {2448954, 0.000002521458}, + {2505735, 0.000002521458}, + {2565212, 0.000002521458}, + {2627581, 0.000002521458}, + {2693059, 0.000002521458}, + {2761883, 0.000002521458}, + {2834317, 0.000002521458}, + {2910653, 0.000002521458}, + {2991214, 0.000002521458}, + {3076362, 0.000002521458}, + {3166500, 0.000002521458}, + {3262080, 0.000002521458}, + {3363608, 0.000002521458}, + {3471660, 0.000002521458}, + {3586885, 0.000002521458}, + {3710020, 0.000002521458}, + 
{3841911, 0.000002521458}, + {3983524, 0.000002521458}, + {4135977, 0.000002521458}, + {4300563, 0.000002521458}, + {4478791, 0.000002521458}, + {4672430, 0.000002521458}, + {4883570, 0.000002521458}, + {5114695, 0.000002521458}, + {5368784, 0.000002521458}, + {5649438, 0.000002521458}, + {5961053, 0.000002521458}, + {6309051, 0.000002521458}, + {6700199, 0.000002521458}, + {7143054, 0.000002521458}, + {7648594, 0.000002521458}, + {8231141, 0.000002521458}, + {8909743, 0.000002521458}, + {9710291, 0.000002521458}, + {10668901, 0.000002521458}, + {11837511, 0.000002521458}, + {13293619, 0.000002521458}, + {15158197, 0.000002521458}, + {0, 0}, }; -dist_point_gen::cdf_point w4[] = { - {53, 0.00074}, - {56, 0.00148}, - {60, 0.00222}, - {64, 0.00296}, - {68, 0.0037}, - {72, 0.00444}, - {77, 0.00518}, - {81, 0.00592}, - {87, 0.00666}, - {92, 0.0074}, - {100, 0.00815}, - {109, 0.0089}, - {119, 0.00965}, - {130, 0.0104}, - {141, 0.01115}, - {154, 0.0119}, - {168, 0.01265}, - {183, 0.0134}, - {199, 0.01415}, - {217, 0.0149}, - {222, 0.0175}, - {227, 0.0201}, - {232, 0.0227}, - {237, 0.0253}, - {243, 0.0279}, - {248, 0.0305}, - {254, 0.0331}, - {259, 0.0357}, - {265, 0.0383}, - {271, 0.0409}, - {274, 0.04201}, - {277, 0.04312}, - {279, 0.04423}, - {282, 0.04534}, - {285, 0.04645}, - {288, 0.04756}, - {291, 0.04867}, - {294, 0.04978}, - {297, 0.05089}, - {300, 0.052}, - {303, 0.06055}, - {305, 0.0691}, - {308, 0.07765}, - {310, 0.0862}, - {313, 0.09475}, - {315, 0.1033}, - {318, 0.11185}, - {321, 0.1204}, - {323, 0.12895}, - {326, 0.1375}, - {331, 0.14401}, - {335, 0.15052}, - {340, 0.15703}, - {345, 0.16354}, - {350, 0.17005}, - {355, 0.17656}, - {360, 0.18307}, - {365, 0.18958}, - {371, 0.19609}, - {376, 0.2026}, - {381, 0.20464}, - {385, 0.20668}, - {390, 0.20872}, - {395, 0.21076}, - {400, 0.2128}, - {405, 0.21484}, - {410, 0.21688}, - {415, 0.21892}, - {420, 0.22096}, - {425, 0.223}, - {430, 0.22709}, - {435, 0.23118}, - {441, 0.23527}, - {446, 0.23936}, - {452, 0.24345}, - {457, 0.24754}, - {463, 0.25163}, - {468, 0.25572}, - {474, 0.25981}, - {480, 0.2639}, - {491, 0.28342}, - {502, 0.30294}, - {513, 0.32246}, - {525, 0.34198}, - {537, 0.3615}, - {549, 0.38102}, - {561, 0.40054}, - {574, 0.42006}, - {587, 0.43958}, - {600, 0.4591}, - {607, 0.46486}, - {615, 0.47062}, - {623, 0.47638}, - {630, 0.48214}, - {638, 0.4879}, - {646, 0.49366}, - {654, 0.49942}, - {662, 0.50518}, - {671, 0.51094}, - {679, 0.5167}, - {685, 0.52023}, - {690, 0.52376}, - {696, 0.52729}, - {702, 0.53082}, - {707, 0.53435}, - {713, 0.53788}, - {719, 0.54141}, - {725, 0.54494}, - {731, 0.54847}, - {737, 0.552}, - {743, 0.55442}, - {749, 0.55684}, - {755, 0.55926}, - {762, 0.56168}, - {768, 0.5641}, - {774, 0.56652}, - {781, 0.56894}, - {787, 0.57136}, - {793, 0.57378}, - {800, 0.5762}, - {808, 0.5777}, - {816, 0.5792}, - {825, 0.5807}, - {833, 0.5822}, - {841, 0.5837}, - {850, 0.5852}, - {859, 0.5867}, - {867, 0.5882}, - {876, 0.5897}, - {885, 0.5912}, - {900, 0.59305}, - {914, 0.5949}, - {929, 0.59675}, - {945, 0.5986}, - {960, 0.60045}, - {976, 0.6023}, - {992, 0.60415}, - {1009, 0.606}, - {1025, 0.60785}, - {1042, 0.6097}, - {1063, 0.61155}, - {1085, 0.6134}, - {1108, 0.61525}, - {1130, 0.6171}, - {1154, 0.61895}, - {1177, 0.6208}, - {1201, 0.62265}, - {1226, 0.6245}, - {1251, 0.62635}, - {1277, 0.6282}, - {1290, 0.62932}, - {1303, 0.63044}, - {1316, 0.63156}, - {1330, 0.63268}, - {1343, 0.6338}, - {1357, 0.63492}, - {1371, 0.63604}, - {1385, 0.63716}, - {1399, 0.63828}, - {1413, 0.6394}, - {1425, 0.64014}, - {1436, 
0.64088}, - {1448, 0.64162}, - {1460, 0.64236}, - {1472, 0.6431}, - {1484, 0.64384}, - {1497, 0.64458}, - {1509, 0.64532}, - {1521, 0.64606}, - {1534, 0.6468}, - {1547, 0.64885}, - {1559, 0.6509}, - {1572, 0.65295}, - {1585, 0.655}, - {1598, 0.65705}, - {1611, 0.6591}, - {1624, 0.66115}, - {1637, 0.6632}, - {1651, 0.66525}, - {1664, 0.6673}, - {1712, 0.66804}, - {1762, 0.66878}, - {1813, 0.66952}, - {1865, 0.67026}, - {1919, 0.671}, - {1975, 0.67174}, - {2032, 0.67248}, - {2091, 0.67322}, - {2152, 0.67396}, - {2214, 0.6747}, - {2311, 0.67563}, - {2412, 0.67656}, - {2517, 0.67749}, - {2627, 0.67842}, - {2742, 0.67935}, - {2862, 0.68028}, - {2987, 0.68121}, - {3118, 0.68214}, - {3254, 0.68307}, - {3396, 0.684}, - {3544, 0.68493}, - {3699, 0.68586}, - {3861, 0.68679}, - {4030, 0.68772}, - {4206, 0.68865}, - {4390, 0.68958}, - {4582, 0.69051}, - {4783, 0.69144}, - {4992, 0.69237}, - {5210, 0.6933}, - {5427, 0.69479}, - {5652, 0.69628}, - {5887, 0.69777}, - {6132, 0.69926}, - {6387, 0.70075}, - {6653, 0.70224}, - {6929, 0.70373}, - {7217, 0.70522}, - {7517, 0.70671}, - {7830, 0.7082}, - {9012, 0.7095}, - {10373, 0.7108}, - {11939, 0.7121}, - {13741, 0.7134}, - {15816, 0.7147}, - {18203, 0.716}, - {20952, 0.7173}, - {24115, 0.7186}, - {27756, 0.7199}, - {31946, 0.7212}, - {32405, 0.72213}, - {32871, 0.72306}, - {33343, 0.72399}, - {33822, 0.72492}, - {34308, 0.72585}, - {34801, 0.72678}, - {35301, 0.72771}, - {35808, 0.72864}, - {36322, 0.72957}, - {36844, 0.7305}, - {37146, 0.73143}, - {37450, 0.73236}, - {37756, 0.73329}, - {38065, 0.73422}, - {38377, 0.73515}, - {38691, 0.73608}, - {39007, 0.73701}, - {39327, 0.73794}, - {39648, 0.73887}, - {39973, 0.7398}, - {40382, 0.74166}, - {40796, 0.74352}, - {41214, 0.74538}, - {41636, 0.74724}, - {42062, 0.7491}, - {42493, 0.75096}, - {42928, 0.75282}, - {43367, 0.75468}, - {43811, 0.75654}, - {44260, 0.7584}, - {44531, 0.76044}, - {44804, 0.76248}, - {45079, 0.76452}, - {45356, 0.76656}, - {45634, 0.7686}, - {45913, 0.77064}, - {46195, 0.77268}, - {46478, 0.77472}, - {46763, 0.77676}, - {47050, 0.7788}, - {47435, 0.78252}, - {47823, 0.78624}, - {48215, 0.78996}, - {48609, 0.79368}, - {49007, 0.7974}, - {49408, 0.80112}, - {49813, 0.80484}, - {50221, 0.80856}, - {50632, 0.81228}, - {51046, 0.816}, - {52097, 0.81674}, - {53169, 0.81748}, - {54264, 0.81822}, - {55381, 0.81896}, - {56521, 0.8197}, - {57684, 0.82044}, - {58872, 0.82118}, - {60084, 0.82192}, - {61321, 0.82266}, - {62583, 0.8234}, - {63353, 0.82433}, - {64132, 0.82526}, - {64921, 0.82619}, - {65720, 0.82712}, - {66528, 0.82805}, - {67347, 0.82898}, - {68175, 0.82991}, - {69014, 0.83084}, - {69863, 0.83177}, - {70722, 0.8327}, - {71156, 0.83363}, - {71592, 0.83456}, - {72031, 0.83549}, - {72473, 0.83642}, - {72917, 0.83735}, - {73364, 0.83828}, - {73814, 0.83921}, - {74266, 0.84014}, - {74722, 0.84107}, - {75180, 0.842}, - {75795, 0.84497}, - {76416, 0.84794}, - {77041, 0.85091}, - {77672, 0.85388}, - {78307, 0.85685}, - {78948, 0.85982}, - {79595, 0.86279}, - {80246, 0.86576}, - {80903, 0.86873}, - {81565, 0.8717}, - {83414, 0.87338}, - {85305, 0.87506}, - {87238, 0.87674}, - {89216, 0.87842}, - {91238, 0.8801}, - {93306, 0.88178}, - {95421, 0.88346}, - {97584, 0.88514}, - {99796, 0.88682}, - {102058, 0.8885}, - {103313, 0.88943}, - {104584, 0.89036}, - {105871, 0.89129}, - {107173, 0.89222}, - {108492, 0.89315}, - {109826, 0.89408}, - {111177, 0.89501}, - {112545, 0.89594}, - {113929, 0.89687}, - {115331, 0.8978}, - {116988, 0.89872}, - {118669, 0.89964}, - {120373, 0.90056}, - {122103, 
0.90148}, - {123857, 0.9024}, - {125636, 0.90332}, - {127441, 0.90424}, - {129272, 0.90516}, - {131129, 0.90608}, - {133013, 0.907}, - {135199, 0.90794}, - {137421, 0.90888}, - {139679, 0.90982}, - {141975, 0.91076}, - {144308, 0.9117}, - {146680, 0.91264}, - {149091, 0.91358}, - {151541, 0.91452}, - {154032, 0.91546}, - {156563, 0.9164}, - {159136, 0.91733}, - {161752, 0.91826}, - {164410, 0.91919}, - {167112, 0.92012}, - {169859, 0.92105}, - {172651, 0.92198}, - {175488, 0.92291}, - {178373, 0.92384}, - {181304, 0.92477}, - {184284, 0.9257}, - {186932, 0.92662}, - {189618, 0.92754}, - {192343, 0.92846}, - {195107, 0.92938}, - {197911, 0.9303}, - {200755, 0.93122}, - {203640, 0.93214}, - {206566, 0.93306}, - {209535, 0.93398}, - {212546, 0.9349}, - {215160, 0.93583}, - {217805, 0.93676}, - {220484, 0.93769}, - {223195, 0.93862}, - {225940, 0.93955}, - {228718, 0.94048}, - {231530, 0.94141}, - {234377, 0.94234}, - {237259, 0.94327}, - {240177, 0.9442}, - {244622, 0.94513}, - {249150, 0.94606}, - {253761, 0.94699}, - {258458, 0.94792}, - {263241, 0.94885}, - {268113, 0.94978}, - {273075, 0.95071}, - {278129, 0.95164}, - {283277, 0.95257}, - {288520, 0.9535}, - {302981, 0.95443}, - {318166, 0.95536}, - {334113, 0.95629}, - {350858, 0.95722}, - {368444, 0.95815}, - {386910, 0.95908}, - {406302, 0.96001}, - {426666, 0.96094}, - {448051, 0.96187}, - {470507, 0.9628}, - {492079, 0.96336}, - {514641, 0.96392}, - {538237, 0.96448}, - {562914, 0.96504}, - {588723, 0.9656}, - {615716, 0.96616}, - {643946, 0.96672}, - {673470, 0.96728}, - {704348, 0.96784}, - {736642, 0.9684}, - {754877, 0.96877}, - {773563, 0.96914}, - {792711, 0.96951}, - {812333, 0.96988}, - {832442, 0.97025}, - {853048, 0.97062}, - {874164, 0.97099}, - {895802, 0.97136}, - {917977, 0.97173}, - {940700, 0.9721}, +dist_point_gen::weight dist_point_gen::w4[] = { + {53, 0.000740000000}, + {56, 0.000740000000}, + {60, 0.000740000000}, + {64, 0.000740000000}, + {68, 0.000740000000}, + {72, 0.000740000000}, + {77, 0.000740000000}, + {81, 0.000740000000}, + {87, 0.000740000000}, + {92, 0.000740000000}, + {100, 0.000750000000}, + {109, 0.000750000000}, + {119, 0.000750000000}, + {130, 0.000750000000}, + {141, 0.000750000000}, + {154, 0.000750000000}, + {168, 0.000750000000}, + {183, 0.000750000000}, + {199, 0.000750000000}, + {217, 0.000750000000}, + {222, 0.002600000000}, + {227, 0.002600000000}, + {232, 0.002600000000}, + {237, 0.002600000000}, + {243, 0.002600000000}, + {248, 0.002600000000}, + {254, 0.002600000000}, + {259, 0.002600000000}, + {265, 0.002600000000}, + {271, 0.002600000000}, + {274, 0.001110000000}, + {277, 0.001110000000}, + {279, 0.001110000000}, + {282, 0.001110000000}, + {285, 0.001110000000}, + {288, 0.001110000000}, + {291, 0.001110000000}, + {294, 0.001110000000}, + {297, 0.001110000000}, + {300, 0.001110000000}, + {303, 0.008550000000}, + {305, 0.008550000000}, + {308, 0.008550000000}, + {310, 0.008550000000}, + {313, 0.008550000000}, + {315, 0.008550000000}, + {318, 0.008550000000}, + {321, 0.008550000000}, + {323, 0.008550000000}, + {326, 0.008550000000}, + {331, 0.006510000000}, + {335, 0.006510000000}, + {340, 0.006510000000}, + {345, 0.006510000000}, + {350, 0.006510000000}, + {355, 0.006510000000}, + {360, 0.006510000000}, + {365, 0.006510000000}, + {371, 0.006510000000}, + {376, 0.006510000000}, + {381, 0.002040000000}, + {385, 0.002040000000}, + {390, 0.002040000000}, + {395, 0.002040000000}, + {400, 0.002040000000}, + {405, 0.002040000000}, + {410, 0.002040000000}, + {415, 0.002040000000}, + {420, 
0.002040000000}, + {425, 0.002040000000}, + {430, 0.004090000000}, + {435, 0.004090000000}, + {441, 0.004090000000}, + {446, 0.004090000000}, + {452, 0.004090000000}, + {457, 0.004090000000}, + {463, 0.004090000000}, + {468, 0.004090000000}, + {474, 0.004090000000}, + {480, 0.004090000000}, + {491, 0.019520000000}, + {502, 0.019520000000}, + {513, 0.019520000000}, + {525, 0.019520000000}, + {537, 0.019520000000}, + {549, 0.019520000000}, + {561, 0.019520000000}, + {574, 0.019520000000}, + {587, 0.019520000000}, + {600, 0.019520000000}, + {607, 0.005760000000}, + {615, 0.005760000000}, + {623, 0.005760000000}, + {630, 0.005760000000}, + {638, 0.005760000000}, + {646, 0.005760000000}, + {654, 0.005760000000}, + {662, 0.005760000000}, + {671, 0.005760000000}, + {679, 0.005760000000}, + {685, 0.003530000000}, + {690, 0.003530000000}, + {696, 0.003530000000}, + {702, 0.003530000000}, + {707, 0.003530000000}, + {713, 0.003530000000}, + {719, 0.003530000000}, + {725, 0.003530000000}, + {731, 0.003530000000}, + {737, 0.003530000000}, + {743, 0.002420000000}, + {749, 0.002420000000}, + {755, 0.002420000000}, + {762, 0.002420000000}, + {768, 0.002420000000}, + {774, 0.002420000000}, + {781, 0.002420000000}, + {787, 0.002420000000}, + {793, 0.002420000000}, + {800, 0.002420000000}, + {808, 0.001500000000}, + {816, 0.001500000000}, + {825, 0.001500000000}, + {833, 0.001500000000}, + {841, 0.001500000000}, + {850, 0.001500000000}, + {859, 0.001500000000}, + {867, 0.001500000000}, + {876, 0.001500000000}, + {885, 0.001500000000}, + {900, 0.001850000000}, + {914, 0.001850000000}, + {929, 0.001850000000}, + {945, 0.001850000000}, + {960, 0.001850000000}, + {976, 0.001850000000}, + {992, 0.001850000000}, + {1009, 0.001850000000}, + {1025, 0.001850000000}, + {1042, 0.001850000000}, + {1063, 0.001850000000}, + {1085, 0.001850000000}, + {1108, 0.001850000000}, + {1130, 0.001850000000}, + {1154, 0.001850000000}, + {1177, 0.001850000000}, + {1201, 0.001850000000}, + {1226, 0.001850000000}, + {1251, 0.001850000000}, + {1277, 0.001850000000}, + {1290, 0.001120000000}, + {1303, 0.001120000000}, + {1316, 0.001120000000}, + {1330, 0.001120000000}, + {1343, 0.001120000000}, + {1357, 0.001120000000}, + {1371, 0.001120000000}, + {1385, 0.001120000000}, + {1399, 0.001120000000}, + {1413, 0.001120000000}, + {1425, 0.000740000000}, + {1436, 0.000740000000}, + {1448, 0.000740000000}, + {1460, 0.000740000000}, + {1472, 0.000740000000}, + {1484, 0.000740000000}, + {1497, 0.000740000000}, + {1509, 0.000740000000}, + {1521, 0.000740000000}, + {1534, 0.000740000000}, + {1547, 0.002050000000}, + {1559, 0.002050000000}, + {1572, 0.002050000000}, + {1585, 0.002050000000}, + {1598, 0.002050000000}, + {1611, 0.002050000000}, + {1624, 0.002050000000}, + {1637, 0.002050000000}, + {1651, 0.002050000000}, + {1664, 0.002050000000}, + {1712, 0.000740000000}, + {1762, 0.000740000000}, + {1813, 0.000740000000}, + {1865, 0.000740000000}, + {1919, 0.000740000000}, + {1975, 0.000740000000}, + {2032, 0.000740000000}, + {2091, 0.000740000000}, + {2152, 0.000740000000}, + {2214, 0.000740000000}, + {2311, 0.000930000000}, + {2412, 0.000930000000}, + {2517, 0.000930000000}, + {2627, 0.000930000000}, + {2742, 0.000930000000}, + {2862, 0.000930000000}, + {2987, 0.000930000000}, + {3118, 0.000930000000}, + {3254, 0.000930000000}, + {3396, 0.000930000000}, + {3544, 0.000930000000}, + {3699, 0.000930000000}, + {3861, 0.000930000000}, + {4030, 0.000930000000}, + {4206, 0.000930000000}, + {4390, 0.000930000000}, + {4582, 0.000930000000}, + {4783, 
0.000930000000}, + {4992, 0.000930000000}, + {5210, 0.000930000000}, + {5427, 0.001490000000}, + {5652, 0.001490000000}, + {5887, 0.001490000000}, + {6132, 0.001490000000}, + {6387, 0.001490000000}, + {6653, 0.001490000000}, + {6929, 0.001490000000}, + {7217, 0.001490000000}, + {7517, 0.001490000000}, + {7830, 0.001490000000}, + {9012, 0.001300000000}, + {10373, 0.001300000000}, + {11939, 0.001300000000}, + {13741, 0.001300000000}, + {15816, 0.001300000000}, + {18203, 0.001300000000}, + {20952, 0.001300000000}, + {24115, 0.001300000000}, + {27756, 0.001300000000}, + {31946, 0.001300000000}, + {32405, 0.000930000000}, + {32871, 0.000930000000}, + {33343, 0.000930000000}, + {33822, 0.000930000000}, + {34308, 0.000930000000}, + {34801, 0.000930000000}, + {35301, 0.000930000000}, + {35808, 0.000930000000}, + {36322, 0.000930000000}, + {36844, 0.000930000000}, + {37146, 0.000930000000}, + {37450, 0.000930000000}, + {37756, 0.000930000000}, + {38065, 0.000930000000}, + {38377, 0.000930000000}, + {38691, 0.000930000000}, + {39007, 0.000930000000}, + {39327, 0.000930000000}, + {39648, 0.000930000000}, + {39973, 0.000930000000}, + {40382, 0.001860000000}, + {40796, 0.001860000000}, + {41214, 0.001860000000}, + {41636, 0.001860000000}, + {42062, 0.001860000000}, + {42493, 0.001860000000}, + {42928, 0.001860000000}, + {43367, 0.001860000000}, + {43811, 0.001860000000}, + {44260, 0.001860000000}, + {44531, 0.002040000000}, + {44804, 0.002040000000}, + {45079, 0.002040000000}, + {45356, 0.002040000000}, + {45634, 0.002040000000}, + {45913, 0.002040000000}, + {46195, 0.002040000000}, + {46478, 0.002040000000}, + {46763, 0.002040000000}, + {47050, 0.002040000000}, + {47435, 0.003720000000}, + {47823, 0.003720000000}, + {48215, 0.003720000000}, + {48609, 0.003720000000}, + {49007, 0.003720000000}, + {49408, 0.003720000000}, + {49813, 0.003720000000}, + {50221, 0.003720000000}, + {50632, 0.003720000000}, + {51046, 0.003720000000}, + {52097, 0.000740000000}, + {53169, 0.000740000000}, + {54264, 0.000740000000}, + {55381, 0.000740000000}, + {56521, 0.000740000000}, + {57684, 0.000740000000}, + {58872, 0.000740000000}, + {60084, 0.000740000000}, + {61321, 0.000740000000}, + {62583, 0.000740000000}, + {63353, 0.000930000000}, + {64132, 0.000930000000}, + {64921, 0.000930000000}, + {65720, 0.000930000000}, + {66528, 0.000930000000}, + {67347, 0.000930000000}, + {68175, 0.000930000000}, + {69014, 0.000930000000}, + {69863, 0.000930000000}, + {70722, 0.000930000000}, + {71156, 0.000930000000}, + {71592, 0.000930000000}, + {72031, 0.000930000000}, + {72473, 0.000930000000}, + {72917, 0.000930000000}, + {73364, 0.000930000000}, + {73814, 0.000930000000}, + {74266, 0.000930000000}, + {74722, 0.000930000000}, + {75180, 0.000930000000}, + {75795, 0.002970000000}, + {76416, 0.002970000000}, + {77041, 0.002970000000}, + {77672, 0.002970000000}, + {78307, 0.002970000000}, + {78948, 0.002970000000}, + {79595, 0.002970000000}, + {80246, 0.002970000000}, + {80903, 0.002970000000}, + {81565, 0.002970000000}, + {83414, 0.001680000000}, + {85305, 0.001680000000}, + {87238, 0.001680000000}, + {89216, 0.001680000000}, + {91238, 0.001680000000}, + {93306, 0.001680000000}, + {95421, 0.001680000000}, + {97584, 0.001680000000}, + {99796, 0.001680000000}, + {102058, 0.001680000000}, + {103313, 0.000930000000}, + {104584, 0.000930000000}, + {105871, 0.000930000000}, + {107173, 0.000930000000}, + {108492, 0.000930000000}, + {109826, 0.000930000000}, + {111177, 0.000930000000}, + {112545, 0.000930000000}, + {113929, 0.000930000000}, + 
{115331, 0.000930000000}, + {116988, 0.000920000000}, + {118669, 0.000920000000}, + {120373, 0.000920000000}, + {122103, 0.000920000000}, + {123857, 0.000920000000}, + {125636, 0.000920000000}, + {127441, 0.000920000000}, + {129272, 0.000920000000}, + {131129, 0.000920000000}, + {133013, 0.000920000000}, + {135199, 0.000940000000}, + {137421, 0.000940000000}, + {139679, 0.000940000000}, + {141975, 0.000940000000}, + {144308, 0.000940000000}, + {146680, 0.000940000000}, + {149091, 0.000940000000}, + {151541, 0.000940000000}, + {154032, 0.000940000000}, + {156563, 0.000940000000}, + {159136, 0.000930000000}, + {161752, 0.000930000000}, + {164410, 0.000930000000}, + {167112, 0.000930000000}, + {169859, 0.000930000000}, + {172651, 0.000930000000}, + {175488, 0.000930000000}, + {178373, 0.000930000000}, + {181304, 0.000930000000}, + {184284, 0.000930000000}, + {186932, 0.000920000000}, + {189618, 0.000920000000}, + {192343, 0.000920000000}, + {195107, 0.000920000000}, + {197911, 0.000920000000}, + {200755, 0.000920000000}, + {203640, 0.000920000000}, + {206566, 0.000920000000}, + {209535, 0.000920000000}, + {212546, 0.000920000000}, + {215160, 0.000930000000}, + {217805, 0.000930000000}, + {220484, 0.000930000000}, + {223195, 0.000930000000}, + {225940, 0.000930000000}, + {228718, 0.000930000000}, + {231530, 0.000930000000}, + {234377, 0.000930000000}, + {237259, 0.000930000000}, + {240177, 0.000930000000}, + {244622, 0.000930000000}, + {249150, 0.000930000000}, + {253761, 0.000930000000}, + {258458, 0.000930000000}, + {263241, 0.000930000000}, + {268113, 0.000930000000}, + {273075, 0.000930000000}, + {278129, 0.000930000000}, + {283277, 0.000930000000}, + {288520, 0.000930000000}, + {302981, 0.000930000000}, + {318166, 0.000930000000}, + {334113, 0.000930000000}, + {350858, 0.000930000000}, + {368444, 0.000930000000}, + {386910, 0.000930000000}, + {406302, 0.000930000000}, + {426666, 0.000930000000}, + {448051, 0.000930000000}, + {470507, 0.000930000000}, + {492079, 0.000560000000}, + {514641, 0.000560000000}, + {538237, 0.000560000000}, + {562914, 0.000560000000}, + {588723, 0.000560000000}, + {615716, 0.000560000000}, + {643946, 0.000560000000}, + {673470, 0.000560000000}, + {704348, 0.000560000000}, + {736642, 0.000560000000}, + {754877, 0.000370000000}, + {773563, 0.000370000000}, + {792711, 0.000370000000}, + {812333, 0.000370000000}, + {832442, 0.000370000000}, + {853048, 0.000370000000}, + {874164, 0.000370000000}, + {895802, 0.000370000000}, + {917977, 0.000370000000}, + {940700, 0.000370000000}, + {1002039, 0.000930000000}, + {1067379, 0.000930000000}, + {1136978, 0.000930000000}, + {1211116, 0.000930000000}, + {1290088, 0.000930000000}, + {1374210, 0.000930000000}, + {1463817, 0.000930000000}, + {1559267, 0.000930000000}, + {1660940, 0.000930000000}, + {1769244, 0.000930000000}, + {1805665, 0.000560000000}, + {1842837, 0.000560000000}, + {1880773, 0.000560000000}, + {1919490, 0.000560000000}, + {1959005, 0.000560000000}, + {1999332, 0.000560000000}, + {2040490, 0.000560000000}, + {2082496, 0.000560000000}, + {2125366, 0.000560000000}, + {2169118, 0.000560000000}, + {2527289, 0.001300000000}, + {2944602, 0.001300000000}, + {3430822, 0.001300000000}, + {3997329, 0.001300000000}, + {4657379, 0.001300000000}, + {5426418, 0.001300000000}, + {6322443, 0.001300000000}, + {7366423, 0.001300000000}, + {8582787, 0.001300000000}, + {10000000, 0.001300000000}, + {0, 0}, +}; - /* Alternate form that doesn't flatten out when the maximum message - * size is 1000000. 
- */ -#if 0 - {920000, 0.9721}, - {925000, 0.974}, - {930000, 0.976}, - {935000, 0.978}, - {940000, 0.980}, - {945000, 0.982}, - {950000, 0.984}, - {955000, 0.986}, - {960000, 0.988}, - {965000, 0.990}, - {970000, 0.992}, - {975000, 0.994}, - {980000, 0.996}, - {990000, 0.998}, - {1000000, 1.0}, -#else - {1002039, 0.97303}, - {1067379, 0.97396}, - {1136978, 0.97489}, - {1211116, 0.97582}, - {1290088, 0.97675}, - {1374210, 0.97768}, - {1463817, 0.97861}, - {1559267, 0.97954}, - {1660940, 0.98047}, - {1769244, 0.9814}, - {1805665, 0.98196}, - {1842837, 0.98252}, - {1880773, 0.98308}, - {1919490, 0.98364}, - {1959005, 0.9842}, - {1999332, 0.98476}, - {2040490, 0.98532}, - {2082496, 0.98588}, - {2125366, 0.98644}, - {2169118, 0.987}, - {2527289, 0.9883}, - {2944602, 0.9896}, - {3430822, 0.9909}, - {3997329, 0.9922}, - {4657379, 0.9935}, - {5426418, 0.9948}, - {6322443, 0.9961}, - {7366423, 0.9974}, - {8582787, 0.9987}, - {10000000, 1.0}, -#endif +dist_point_gen::weight dist_point_gen::w5[] = { + {1430, 0.025000000000}, + {2860, 0.025000000000}, + {4290, 0.025000000000}, + {5720, 0.025000000000}, + {7150, 0.025000000000}, + {8580, 0.025000000000}, + {10010, 0.007143000000}, + {11440, 0.007143000000}, + {13230, 0.007143000000}, + {14700, 0.007142000000}, + {16170, 0.007143000000}, + {17640, 0.007143000000}, + {19110, 0.007143000000}, + {20580, 0.016667000000}, + {22050, 0.016666000000}, + {23520, 0.016667000000}, + {24990, 0.016667000000}, + {26460, 0.016666000000}, + {27930, 0.016667000000}, + {29400, 0.007143000000}, + {30870, 0.007143000000}, + {32340, 0.007143000000}, + {33810, 0.007142000000}, + {35280, 0.007143000000}, + {36750, 0.007143000000}, + {38220, 0.007143000000}, + {39690, 0.007143000000}, + {41160, 0.007143000000}, + {42630, 0.007143000000}, + {44100, 0.007142000000}, + {45570, 0.007143000000}, + {47040, 0.007143000000}, + {48510, 0.007143000000}, + {49980, 0.006500000000}, + {51450, 0.006500000000}, + {52920, 0.006500000000}, + {54390, 0.006500000000}, + {55860, 0.006500000000}, + {57330, 0.006500000000}, + {58800, 0.006500000000}, + {60270, 0.006500000000}, + {61740, 0.006500000000}, + {63210, 0.006500000000}, + {64680, 0.006500000000}, + {66150, 0.006500000000}, + {67620, 0.006500000000}, + {69090, 0.006500000000}, + {70560, 0.006500000000}, + {72030, 0.006500000000}, + {73500, 0.006500000000}, + {74970, 0.006500000000}, + {76440, 0.006500000000}, + {77910, 0.006500000000}, + {79380, 0.000875000000}, + {80850, 0.000875000000}, + {82320, 0.000875000000}, + {83790, 0.000875000000}, + {85260, 0.000875000000}, + {86730, 0.000875000000}, + {88200, 0.000875000000}, + {89670, 0.000875000000}, + {91140, 0.000875000000}, + {92610, 0.000875000000}, + {94080, 0.000875000000}, + {95550, 0.000875000000}, + {97020, 0.000875000000}, + {98490, 0.000875000000}, + {99960, 0.000875000000}, + {101430, 0.000875000000}, + {102900, 0.000875000000}, + {104370, 0.000875000000}, + {105840, 0.000875000000}, + {107310, 0.000875000000}, + {108780, 0.000875000000}, + {110250, 0.000875000000}, + {111720, 0.000875000000}, + {113190, 0.000875000000}, + {114660, 0.000875000000}, + {116130, 0.000875000000}, + {117600, 0.000875000000}, + {119070, 0.000875000000}, + {120540, 0.000875000000}, + {122010, 0.000875000000}, + {123480, 0.000875000000}, + {124950, 0.000875000000}, + {126420, 0.000875000000}, + {127890, 0.000875000000}, + {129360, 0.000875000000}, + {130830, 0.000875000000}, + {132300, 0.000875000000}, + {133770, 0.000875000000}, + {135240, 0.000875000000}, + {136710, 0.000875000000}, + {138180, 
0.000875000000}, + {139650, 0.000875000000}, + {141120, 0.000875000000}, + {142590, 0.000875000000}, + {144060, 0.000875000000}, + {145530, 0.000875000000}, + {147000, 0.000875000000}, + {148470, 0.000875000000}, + {149940, 0.000875000000}, + {151410, 0.000875000000}, + {152880, 0.000875000000}, + {154350, 0.000875000000}, + {155820, 0.000875000000}, + {157290, 0.000875000000}, + {158760, 0.000875000000}, + {160230, 0.000875000000}, + {161700, 0.000875000000}, + {163170, 0.000875000000}, + {164640, 0.000875000000}, + {166110, 0.000875000000}, + {167580, 0.000875000000}, + {169050, 0.000875000000}, + {170520, 0.000875000000}, + {171990, 0.000875000000}, + {173460, 0.000875000000}, + {174930, 0.000875000000}, + {176400, 0.000875000000}, + {177870, 0.000875000000}, + {179340, 0.000875000000}, + {180810, 0.000875000000}, + {182280, 0.000875000000}, + {183750, 0.000875000000}, + {185220, 0.000875000000}, + {186690, 0.000875000000}, + {188160, 0.000875000000}, + {189630, 0.000875000000}, + {191100, 0.000875000000}, + {192570, 0.000875000000}, + {194040, 0.000875000000}, + {195510, 0.000875000000}, + {196980, 0.000187000000}, + {198450, 0.000188000000}, + {199920, 0.000187000000}, + {201390, 0.000187000000}, + {202860, 0.000187000000}, + {204330, 0.000188000000}, + {205800, 0.000187000000}, + {207270, 0.000187000000}, + {208740, 0.000187000000}, + {210210, 0.000188000000}, + {211680, 0.000187000000}, + {213150, 0.000187000000}, + {214620, 0.000187000000}, + {216090, 0.000188000000}, + {217560, 0.000187000000}, + {219030, 0.000187000000}, + {220500, 0.000188000000}, + {221970, 0.000187000000}, + {223440, 0.000187000000}, + {224910, 0.000187000000}, + {226380, 0.000188000000}, + {227850, 0.000187000000}, + {229320, 0.000187000000}, + {230790, 0.000187000000}, + {232260, 0.000188000000}, + {233730, 0.000187000000}, + {235200, 0.000187000000}, + {236670, 0.000187000000}, + {238140, 0.000188000000}, + {239610, 0.000187000000}, + {241080, 0.000187000000}, + {242550, 0.000188000000}, + {244020, 0.000187000000}, + {245490, 0.000187000000}, + {246960, 0.000187000000}, + {248430, 0.000188000000}, + {249900, 0.000187000000}, + {251370, 0.000187000000}, + {252840, 0.000187000000}, + {254310, 0.000188000000}, + {255780, 0.000187000000}, + {257250, 0.000187000000}, + {258720, 0.000187000000}, + {260190, 0.000188000000}, + {261660, 0.000187000000}, + {263130, 0.000187000000}, + {264600, 0.000187000000}, + {266070, 0.000188000000}, + {267540, 0.000187000000}, + {269010, 0.000187000000}, + {270480, 0.000188000000}, + {271950, 0.000187000000}, + {273420, 0.000187000000}, + {274890, 0.000187000000}, + {276360, 0.000188000000}, + {277830, 0.000187000000}, + {279300, 0.000187000000}, + {280770, 0.000187000000}, + {282240, 0.000188000000}, + {283710, 0.000187000000}, + {285180, 0.000187000000}, + {286650, 0.000187000000}, + {288120, 0.000188000000}, + {289590, 0.000187000000}, + {291060, 0.000187000000}, + {292530, 0.000188000000}, + {294000, 0.000187000000}, + {295470, 0.000187000000}, + {296940, 0.000187000000}, + {298410, 0.000188000000}, + {299880, 0.000187000000}, + {301350, 0.000187000000}, + {302820, 0.000187000000}, + {304290, 0.000188000000}, + {305760, 0.000187000000}, + {307230, 0.000187000000}, + {308700, 0.000187000000}, + {310170, 0.000188000000}, + {311640, 0.000187000000}, + {313110, 0.000187000000}, + {314580, 0.000188000000}, + {316050, 0.000187000000}, + {317520, 0.000187000000}, + {318990, 0.000187000000}, + {320460, 0.000188000000}, + {321930, 0.000187000000}, + {323400, 0.000187000000}, + 
{324870, 0.000187000000}, + {326340, 0.000188000000}, + {327810, 0.000187000000}, + {329280, 0.000187000000}, + {330750, 0.000187000000}, + {332220, 0.000188000000}, + {333690, 0.000187000000}, + {335160, 0.000187000000}, + {336630, 0.000188000000}, + {338100, 0.000187000000}, + {339570, 0.000187000000}, + {341040, 0.000187000000}, + {342510, 0.000188000000}, + {343980, 0.000187000000}, + {345450, 0.000187000000}, + {346920, 0.000187000000}, + {348390, 0.000188000000}, + {349860, 0.000187000000}, + {351330, 0.000187000000}, + {352800, 0.000187000000}, + {354270, 0.000188000000}, + {355740, 0.000187000000}, + {357210, 0.000187000000}, + {358680, 0.000188000000}, + {360150, 0.000187000000}, + {361620, 0.000187000000}, + {363090, 0.000187000000}, + {364560, 0.000188000000}, + {373380, 0.001123000000}, + {392490, 0.002435000000}, + {411600, 0.002434000000}, + {432180, 0.002622000000}, + {451290, 0.002434000000}, + {470400, 0.002435000000}, + {490980, 0.002621000000}, + {510090, 0.002435000000}, + {529200, 0.002434000000}, + {549780, 0.002622000000}, + {568890, 0.002435000000}, + {588000, 0.002434000000}, + {608580, 0.002622000000}, + {627690, 0.002434000000}, + {648270, 0.002622000000}, + {667380, 0.002434000000}, + {686490, 0.002435000000}, + {707070, 0.002622000000}, + {726180, 0.002434000000}, + {745290, 0.002434000000}, + {765870, 0.002622000000}, + {784980, 0.002435000000}, + {804090, 0.002434000000}, + {824670, 0.002622000000}, + {843780, 0.002434000000}, + {862890, 0.002435000000}, + {883470, 0.002621000000}, + {902580, 0.002435000000}, + {921690, 0.002434000000}, + {942270, 0.002622000000}, + {961380, 0.002435000000}, + {980490, 0.002434000000}, + {1005480, 0.002553000000}, + {1030470, 0.002552000000}, + {1053990, 0.002403000000}, + {1078980, 0.002552000000}, + {1103970, 0.002553000000}, + {1127490, 0.002402000000}, + {1152480, 0.002553000000}, + {1177470, 0.002552000000}, + {1200990, 0.002403000000}, + {1225980, 0.002552000000}, + {1250970, 0.002553000000}, + {1274490, 0.002402000000}, + {1299480, 0.002553000000}, + {1324470, 0.002552000000}, + {1347990, 0.002403000000}, + {1372980, 0.002552000000}, + {1397970, 0.002553000000}, + {1421490, 0.002402000000}, + {1446480, 0.002553000000}, + {1470000, 0.002402000000}, + {1494990, 0.002553000000}, + {1519980, 0.002552000000}, + {1543500, 0.002403000000}, + {1568490, 0.002552000000}, + {1593480, 0.002553000000}, + {1617000, 0.002402000000}, + {1641990, 0.002553000000}, + {1666980, 0.002552000000}, + {1690500, 0.002403000000}, + {1715490, 0.002552000000}, + {1740480, 0.002553000000}, + {1764000, 0.002402000000}, + {1788990, 0.002553000000}, + {1813980, 0.002552000000}, + {1837500, 0.002403000000}, + {1862490, 0.002552000000}, + {1887480, 0.002553000000}, + {1911000, 0.002402000000}, + {1935990, 0.002553000000}, + {1959510, 0.002402000000}, + {2033010, 0.002500000000}, + {2106510, 0.002500000000}, + {2180010, 0.002500000000}, + {2253510, 0.002500000000}, + {2327010, 0.002500000000}, + {2400510, 0.002500000000}, + {2474010, 0.002500000000}, + {2547510, 0.002500000000}, + {2621010, 0.002500000000}, + {2694510, 0.002500000000}, + {2768010, 0.002500000000}, + {2841510, 0.002500000000}, + {2915010, 0.002500000000}, + {2988510, 0.002500000000}, + {3062010, 0.002500000000}, + {3135510, 0.002500000000}, + {3209010, 0.002500000000}, + {3282510, 0.002500000000}, + {3356010, 0.002500000000}, + {3429510, 0.002500000000}, + {3503010, 0.002500000000}, + {3576510, 0.002500000000}, + {3650010, 0.002500000000}, + {3723510, 0.002500000000}, + {3797010, 
0.002500000000}, + {3870510, 0.002500000000}, + {3944010, 0.002500000000}, + {4017510, 0.002500000000}, + {4091010, 0.002500000000}, + {4164510, 0.002500000000}, + {4238010, 0.002500000000}, + {4311510, 0.002500000000}, + {4385010, 0.002500000000}, + {4458510, 0.002500000000}, + {4532010, 0.002500000000}, + {4605510, 0.002500000000}, + {4679010, 0.002500000000}, + {4752510, 0.002500000000}, + {4826010, 0.002500000000}, + {4899510, 0.002500000000}, + {10291470, 0.002505000000}, + {10780980, 0.002498000000}, + {11270490, 0.002497000000}, + {11761470, 0.002505000000}, + {12250980, 0.002498000000}, + {12740490, 0.002497000000}, + {13231470, 0.002505000000}, + {13720980, 0.002498000000}, + {14210490, 0.002498000000}, + {14701470, 0.002505000000}, + {15190980, 0.002497000000}, + {15680490, 0.002498000000}, + {16171470, 0.002505000000}, + {16660980, 0.002497000000}, + {17150490, 0.002498000000}, + {17641470, 0.002505000000}, + {18130980, 0.002498000000}, + {18620490, 0.002497000000}, + {19111470, 0.002505000000}, + {19600980, 0.002498000000}, + {20090490, 0.002497000000}, + {20581470, 0.002505000000}, + {21070980, 0.002498000000}, + {21560490, 0.002498000000}, + {22051470, 0.002505000000}, + {22540980, 0.002497000000}, + {23030490, 0.002498000000}, + {23521470, 0.002505000000}, + {24010980, 0.002497000000}, + {24500490, 0.002498000000}, + {24991470, 0.002505000000}, + {25480980, 0.002498000000}, + {25970490, 0.002497000000}, + {26461470, 0.002505000000}, + {26950980, 0.002498000000}, + {27440490, 0.002497000000}, + {27931470, 0.002505000000}, + {28420980, 0.002498000000}, + {28910490, 0.002497000000}, + {29400000, 0.002498000000}, + {0, 0}, }; -dist_point_gen::cdf_point w5[] = { - {1430, 0.025000}, - {2860, 0.050000}, - {4290, 0.075000}, - {5720, 0.100000}, - {7150, 0.125000}, - {8580, 0.150000}, - {10010, 0.157143}, - {11440, 0.164286}, - {13230, 0.171429}, - {14700, 0.178571}, - {16170, 0.185714}, - {17640, 0.192857}, - {19110, 0.200000}, - {20580, 0.216667}, - {22050, 0.233333}, - {23520, 0.250000}, - {24990, 0.266667}, - {26460, 0.283333}, - {27930, 0.300000}, - {29400, 0.307143}, - {30870, 0.314286}, - {32340, 0.321429}, - {33810, 0.328571}, - {35280, 0.335714}, - {36750, 0.342857}, - {38220, 0.350000}, - {39690, 0.357143}, - {41160, 0.364286}, - {42630, 0.371429}, - {44100, 0.378571}, - {45570, 0.385714}, - {47040, 0.392857}, - {48510, 0.400000}, - {49980, 0.406500}, - {51450, 0.413000}, - {52920, 0.419500}, - {54390, 0.426000}, - {55860, 0.432500}, - {57330, 0.439000}, - {58800, 0.445500}, - {60270, 0.452000}, - {61740, 0.458500}, - {63210, 0.465000}, - {64680, 0.471500}, - {66150, 0.478000}, - {67620, 0.484500}, - {69090, 0.491000}, - {70560, 0.497500}, - {72030, 0.504000}, - {73500, 0.510500}, - {74970, 0.517000}, - {76440, 0.523500}, - {77910, 0.530000}, - {79380, 0.530875}, - {80850, 0.531750}, - {82320, 0.532625}, - {83790, 0.533500}, - {85260, 0.534375}, - {86730, 0.535250}, - {88200, 0.536125}, - {89670, 0.537000}, - {91140, 0.537875}, - {92610, 0.538750}, - {94080, 0.539625}, - {95550, 0.540500}, - {97020, 0.541375}, - {98490, 0.542250}, - {99960, 0.543125}, - {101430, 0.544000}, - {102900, 0.544875}, - {104370, 0.545750}, - {105840, 0.546625}, - {107310, 0.547500}, - {108780, 0.548375}, - {110250, 0.549250}, - {111720, 0.550125}, - {113190, 0.551000}, - {114660, 0.551875}, - {116130, 0.552750}, - {117600, 0.553625}, - {119070, 0.554500}, - {120540, 0.555375}, - {122010, 0.556250}, - {123480, 0.557125}, - {124950, 0.558000}, - {126420, 0.558875}, - {127890, 0.559750}, - {129360, 
0.560625}, - {130830, 0.561500}, - {132300, 0.562375}, - {133770, 0.563250}, - {135240, 0.564125}, - {136710, 0.565000}, - {138180, 0.565875}, - {139650, 0.566750}, - {141120, 0.567625}, - {142590, 0.568500}, - {144060, 0.569375}, - {145530, 0.570250}, - {147000, 0.571125}, - {148470, 0.572000}, - {149940, 0.572875}, - {151410, 0.573750}, - {152880, 0.574625}, - {154350, 0.575500}, - {155820, 0.576375}, - {157290, 0.577250}, - {158760, 0.578125}, - {160230, 0.579000}, - {161700, 0.579875}, - {163170, 0.580750}, - {164640, 0.581625}, - {166110, 0.582500}, - {167580, 0.583375}, - {169050, 0.584250}, - {170520, 0.585125}, - {171990, 0.586000}, - {173460, 0.586875}, - {174930, 0.587750}, - {176400, 0.588625}, - {177870, 0.589500}, - {179340, 0.590375}, - {180810, 0.591250}, - {182280, 0.592125}, - {183750, 0.593000}, - {185220, 0.593875}, - {186690, 0.594750}, - {188160, 0.595625}, - {189630, 0.596500}, - {191100, 0.597375}, - {192570, 0.598250}, - {194040, 0.599125}, - {195510, 0.600000}, - {196980, 0.600187}, - {198450, 0.600375}, - {199920, 0.600562}, - {201390, 0.600749}, - {202860, 0.600936}, - {204330, 0.601124}, - {205800, 0.601311}, - {207270, 0.601498}, - {208740, 0.601685}, - {210210, 0.601873}, - {211680, 0.602060}, - {213150, 0.602247}, - {214620, 0.602434}, - {216090, 0.602622}, - {217560, 0.602809}, - {219030, 0.602996}, - {220500, 0.603184}, - {221970, 0.603371}, - {223440, 0.603558}, - {224910, 0.603745}, - {226380, 0.603933}, - {227850, 0.604120}, - {229320, 0.604307}, - {230790, 0.604494}, - {232260, 0.604682}, - {233730, 0.604869}, - {235200, 0.605056}, - {236670, 0.605243}, - {238140, 0.605431}, - {239610, 0.605618}, - {241080, 0.605805}, - {242550, 0.605993}, - {244020, 0.606180}, - {245490, 0.606367}, - {246960, 0.606554}, - {248430, 0.606742}, - {249900, 0.606929}, - {251370, 0.607116}, - {252840, 0.607303}, - {254310, 0.607491}, - {255780, 0.607678}, - {257250, 0.607865}, - {258720, 0.608052}, - {260190, 0.608240}, - {261660, 0.608427}, - {263130, 0.608614}, - {264600, 0.608801}, - {266070, 0.608989}, - {267540, 0.609176}, - {269010, 0.609363}, - {270480, 0.609551}, - {271950, 0.609738}, - {273420, 0.609925}, - {274890, 0.610112}, - {276360, 0.610300}, - {277830, 0.610487}, - {279300, 0.610674}, - {280770, 0.610861}, - {282240, 0.611049}, - {283710, 0.611236}, - {285180, 0.611423}, - {286650, 0.611610}, - {288120, 0.611798}, - {289590, 0.611985}, - {291060, 0.612172}, - {292530, 0.612360}, - {294000, 0.612547}, - {295470, 0.612734}, - {296940, 0.612921}, - {298410, 0.613109}, - {299880, 0.613296}, - {301350, 0.613483}, - {302820, 0.613670}, - {304290, 0.613858}, - {305760, 0.614045}, - {307230, 0.614232}, - {308700, 0.614419}, - {310170, 0.614607}, - {311640, 0.614794}, - {313110, 0.614981}, - {314580, 0.615169}, - {316050, 0.615356}, - {317520, 0.615543}, - {318990, 0.615730}, - {320460, 0.615918}, - {321930, 0.616105}, - {323400, 0.616292}, - {324870, 0.616479}, - {326340, 0.616667}, - {327810, 0.616854}, - {329280, 0.617041}, - {330750, 0.617228}, - {332220, 0.617416}, - {333690, 0.617603}, - {335160, 0.617790}, - {336630, 0.617978}, - {338100, 0.618165}, - {339570, 0.618352}, - {341040, 0.618539}, - {342510, 0.618727}, - {343980, 0.618914}, - {345450, 0.619101}, - {346920, 0.619288}, - {348390, 0.619476}, - {349860, 0.619663}, - {351330, 0.619850}, - {352800, 0.620037}, - {354270, 0.620225}, - {355740, 0.620412}, - {357210, 0.620599}, - {358680, 0.620787}, - {360150, 0.620974}, - {361620, 0.621161}, - {363090, 0.621348}, - {364560, 0.621536}, - {373380, 0.622659}, - 
{392490, 0.625094}, - {411600, 0.627528}, - {432180, 0.630150}, - {451290, 0.632584}, - {470400, 0.635019}, - {490980, 0.637640}, - {510090, 0.640075}, - {529200, 0.642509}, - {549780, 0.645131}, - {568890, 0.647566}, - {588000, 0.650000}, - {608580, 0.652622}, - {627690, 0.655056}, - {648270, 0.657678}, - {667380, 0.660112}, - {686490, 0.662547}, - {707070, 0.665169}, - {726180, 0.667603}, - {745290, 0.670037}, - {765870, 0.672659}, - {784980, 0.675094}, - {804090, 0.677528}, - {824670, 0.680150}, - {843780, 0.682584}, - {862890, 0.685019}, - {883470, 0.687640}, - {902580, 0.690075}, - {921690, 0.692509}, - {942270, 0.695131}, - {961380, 0.697566}, - {980490, 0.700000}, - {1005480, 0.702553}, - {1030470, 0.705105}, - {1053990, 0.707508}, - {1078980, 0.710060}, - {1103970, 0.712613}, - {1127490, 0.715015}, - {1152480, 0.717568}, - {1177470, 0.720120}, - {1200990, 0.722523}, - {1225980, 0.725075}, - {1250970, 0.727628}, - {1274490, 0.730030}, - {1299480, 0.732583}, - {1324470, 0.735135}, - {1347990, 0.737538}, - {1372980, 0.740090}, - {1397970, 0.742643}, - {1421490, 0.745045}, - {1446480, 0.747598}, - {1470000, 0.750000}, - {1494990, 0.752553}, - {1519980, 0.755105}, - {1543500, 0.757508}, - {1568490, 0.760060}, - {1593480, 0.762613}, - {1617000, 0.765015}, - {1641990, 0.767568}, - {1666980, 0.770120}, - {1690500, 0.772523}, - {1715490, 0.775075}, - {1740480, 0.777628}, - {1764000, 0.780030}, - {1788990, 0.782583}, - {1813980, 0.785135}, - {1837500, 0.787538}, - {1862490, 0.790090}, - {1887480, 0.792643}, - {1911000, 0.795045}, - {1935990, 0.797598}, - {1959510, 0.800000}, - {2033010, 0.802500}, - {2106510, 0.805000}, - {2180010, 0.807500}, - {2253510, 0.810000}, - {2327010, 0.812500}, - {2400510, 0.815000}, - {2474010, 0.817500}, - {2547510, 0.820000}, - {2621010, 0.822500}, - {2694510, 0.825000}, - {2768010, 0.827500}, - {2841510, 0.830000}, - {2915010, 0.832500}, - {2988510, 0.835000}, - {3062010, 0.837500}, - {3135510, 0.840000}, - {3209010, 0.842500}, - {3282510, 0.845000}, - {3356010, 0.847500}, - {3429510, 0.850000}, - {3503010, 0.852500}, - {3576510, 0.855000}, - {3650010, 0.857500}, - {3723510, 0.860000}, - {3797010, 0.862500}, - {3870510, 0.865000}, - {3944010, 0.867500}, - {4017510, 0.870000}, - {4091010, 0.872500}, - {4164510, 0.875000}, - {4238010, 0.877500}, - {4311510, 0.880000}, - {4385010, 0.882500}, - {4458510, 0.885000}, - {4532010, 0.887500}, - {4605510, 0.890000}, - {4679010, 0.892500}, - {4752510, 0.895000}, - {4826010, 0.897500}, - {4899510, 0.900000}, - {10291470, 0.902505}, - {10780980, 0.905003}, - {11270490, 0.907500}, - {11761470, 0.910005}, - {12250980, 0.912503}, - {12740490, 0.915000}, - {13231470, 0.917505}, - {13720980, 0.920003}, - {14210490, 0.922501}, - {14701470, 0.925006}, - {15190980, 0.927503}, - {15680490, 0.930001}, - {16171470, 0.932506}, - {16660980, 0.935003}, - {17150490, 0.937501}, - {17641470, 0.940006}, - {18130980, 0.942504}, - {18620490, 0.945001}, - {19111470, 0.947506}, - {19600980, 0.950004}, - {20090490, 0.952501}, - {20581470, 0.955006}, - {21070980, 0.957504}, - {21560490, 0.960002}, - {22051470, 0.962507}, - {22540980, 0.965004}, - {23030490, 0.967502}, - {23521470, 0.970007}, - {24010980, 0.972504}, - {24500490, 0.975002}, - {24991470, 0.977507}, - {25480980, 0.980005}, - {25970490, 0.982502}, - {26461470, 0.985007}, - {26950980, 0.987505}, - {27440490, 0.990002}, - {27931470, 0.992507}, - {28420980, 0.995005}, - {28910490, 0.997502}, - {29400000, 1.000000}, +/* The distribution below is not representative of any real work load; it 
+ * is intended to maximize the likelihood that large requests starve. Run
+ * it at 100% network load for 30 seconds or more.
+ */
+dist_point_gen::weight dist_point_gen::starve[] = {
+ {100000, 100},
+ {200000, 100},
+ {300000, 100},
+ {400000, 100},
+ {500000, 100},
+ {600000, 100},
+ {700000, 100},
+ {800000, 100},
+ {900000, 100},
+ {1000000, 50},
+};
diff --git a/util/dist.h b/util/dist.h index fbf022d3..840a50cf 100644 --- a/util/dist.h +++ b/util/dist.h @@ -1,16 +1,5 @@
-/* Copyright (c) 2019-2023 Stanford University
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/* Copyright (c) 2019-2023 Homa Developers
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
 */
/* This file defines classes that contain information and supporting
@@ -35,6 +24,7 @@ class dist_point_gen {
double get_mean() const {return dist_mean;}
double dist_overhead(int mtu) const;
std::vector<int> values() const;
+ std::vector<double> cdf_fractions() const;
/**
 * struct dist_point - Describes one point in a CDF of message lengths.
@@ -70,5 +60,25 @@ class dist_point_gen {
std::uniform_real_distribution<double> uniform_dist;
static int dist_msg_overhead(int length, int mtu);
+
+ /**
+ * Used for entering raw data (weights instead of a CDF); this is
+ * usually more convenient when entering distributions by hand.
+ */
+ struct weight {
+ /** @length: message length, in bytes. */
+ size_t length;
+
+ /**
+ * @freq: relative frequency of messages of this length.
+ */
+ double freq;
+
+ weight(size_t length, double freq)
+ : length(length), freq(freq)
+ {}
+ };
+
+ static struct weight w1[], w2[], w3[], w4[], w5[], starve[];
};
#endif /* _DIST_H */
diff --git a/util/dist_test.cc b/util/dist_test.cc index 9dfa217f..df839513 100644 --- a/util/dist_test.cc +++ b/util/dist_test.cc @@ -1,16 +1,5 @@
-/* Copyright (c) 2023 Stanford University
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/* Copyright (c) 2023 Homa Developers
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
 */
#include
diff --git a/util/dist_to_proto.cc b/util/dist_to_proto.cc new file mode 100644 index 00000000..ccf35a53 --- /dev/null +++ b/util/dist_to_proto.cc @@ -0,0 +1,65 @@
+/* Copyright (c) 2023 Homa Developers
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+ */
+
+#include "dist.h"
+extern "C" {
+#include "homa.h"
+}
+#include <cmath>
+#include <iostream>
+
+/**
+ * This program takes one of the five workload distributions and converts
+ * it into a fragment of a textformat protobuf used in distbench. It first
+ * merges buckets and truncates cdf_point sizes according to command-line
+ * arguments, then writes the cdf_points to stdout and the interval
+ * conversion to stderr.
+ *
+ * Usage:
+ * ./dist_to_proto workload [max message length] [min bucket frac]
+ * [max size ratio] [gigabits per second]
+ */
+int main(int argc, char **argv)
+{
+ int max_message_length = HOMA_MAX_MESSAGE_LENGTH;
+ double min_bucket_frac = 0.0025;
+ double max_size_ratio = 1.2;
+ double gbps = 20.0;
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s workload [max message length] "
+ "[min bucket frac] [max size ratio] [gbps]\n",
+ argv[0]);
+ exit(1);
+ }
+ if (argc > 2) {
+ max_message_length = atoi(argv[2]);
+ }
+ if (argc > 3) {
+ min_bucket_frac = std::stod(argv[3]);
+ }
+ if (argc > 4) {
+ max_size_ratio = std::stod(argv[4]);
+ }
+ if (argc > 5) {
+ gbps = std::stod(argv[5]);
+ }
+
+ dist_point_gen generator(argv[1], max_message_length,
+ min_bucket_frac, max_size_ratio);
+ std::vector<int> values = generator.values();
+ std::vector<double> fractions = generator.cdf_fractions();
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ printf(" cdf_points { value: %d, cdf: %20.19f }\n",
+ values[i], fractions[i]);
+ }
+
+ /*
+ * Convert the average size to bits, then divide by gbps and round up
+ * to get nanoseconds, then multiply by 2 because request size and
+ * response size are equal.
+ */
+ double interval_ns = (std::ceil((generator.get_mean() * 8.0) / gbps))
+ * 2;
+ fprintf(stderr, "%.0f", interval_ns);
+} \ No newline at end of file
diff --git a/util/get_time_trace.c b/util/get_time_trace.c index 995b64fe..9cb11717 100644 --- a/util/get_time_trace.c +++ b/util/get_time_trace.c @@ -1,16 +1,5 @@
-/* Copyright (c) 2019-2022 Stanford University
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /** diff --git a/util/get_traces b/util/get_traces new file mode 100755 index 00000000..f166adab --- /dev/null +++ b/util/get_traces @@ -0,0 +1,26 @@ +#!/bin/bash + +# Copyright (c) 2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +# Usage: +# get_traces first last dst +# +# This script will retrieve node.tt timetraces from the home directory +# of the nodes with numbers from first to last, inclusive, and store them +# in files nodeN.tt in directory dst. + +if [ $# -ne 3 ]; then + echo "Usage: get_traces first last dst" + exit 1 +fi +first=$1 +last=$2 +dst=$3 + +for ((i = $first ; i <= $last; i++)); do + node=node$i + echo $node + mkdir -p $dst + cl ssh $node cat node.tt > $dst/$node.tt +done diff --git a/util/homa_prio.cc b/util/homa_prio.cc index c3575823..b33a6010 100644 --- a/util/homa_prio.cc +++ b/util/homa_prio.cc @@ -1,16 +1,5 @@ -/* Copyright (c) 2020-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2020-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains a program that dynamically adjusts Homa's allocation @@ -29,7 +18,9 @@ #include #include +extern "C" { #include "homa.h" +} /* Values of command-line arguments (and their default values): */ @@ -329,11 +320,10 @@ void read_metrics(const char *path, metrics *metrics) * unsched_cutoffs parameter. * @num_priorities: Total number of priorities available for Homa (including * both scheduled and unscheduled). - * @rtt_bytes: Homa's rtt_bytes parameter (i.e., the maximum number of - * unscheduled bytes in any message). + * @unsched_bytes: Homa's unsched_bytes parameter. */ void compute_cutoffs(metrics *diff, int cutoffs[8], int num_priorities, - int rtt_bytes) + int unsched_bytes) { int64_t total_bytes, total_unsched_bytes; int prev_size; @@ -348,11 +338,11 @@ void compute_cutoffs(metrics *diff, int cutoffs[8], int num_priorities, "by %d\n", prev_size, interval.max_size); total_bytes += interval.total_bytes; - if (interval.max_size <= rtt_bytes) + if (interval.max_size <= unsched_bytes) interval.unsched_bytes = interval.total_bytes; else { interval.unsched_bytes = interval.total_messages - * rtt_bytes; + * unsched_bytes; if (interval.unsched_bytes > interval.total_bytes) interval.unsched_bytes = interval.total_bytes; } @@ -360,7 +350,7 @@ void compute_cutoffs(metrics *diff, int cutoffs[8], int num_priorities, prev_size = interval.max_size; } total_bytes += diff->large_msg_bytes; - total_unsched_bytes += diff->large_msg_count * rtt_bytes; + total_unsched_bytes += diff->large_msg_count * unsched_bytes; // Divide priorities between scheduled and unscheduled packets. 
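// Worked example (illustrative, not from the original source): with
// unsched_bytes = 60000, an interval of 10-KB messages contributes all
// of its bytes to total_unsched_bytes, while an interval of 1-MB
// messages contributes only its first 60000 bytes per message; the
// resulting unscheduled share of total_bytes then drives how the
// priority levels are split below.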
int64_t unsched_prios = unsched; @@ -656,7 +646,7 @@ int main(int argc, const char** argv) int prev_deciles[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; int cutoffs[8]; int num_priorities = 1; - int rtt_bytes = 0; + int unsched_bytes = 0; int prev_num_priorities = -1; while (1) { usleep(1000*reconfig_interval); @@ -673,8 +663,8 @@ int main(int argc, const char** argv) log(NORMAL, "get_param failed for num_priorities\n"); continue; } - if (!get_param("rtt_bytes", &rtt_bytes)) { - log(NORMAL, "get_param failed for rtt_bytes\n"); + if (!get_param("unsched_bytes", &unsched_bytes)) { + log(NORMAL, "get_param failed for unsched_bytes\n"); continue; } @@ -716,7 +706,7 @@ int main(int argc, const char** argv) drift, min_drift); continue; } - compute_cutoffs(&diff, cutoffs, num_priorities, rtt_bytes); + compute_cutoffs(&diff, cutoffs, num_priorities, unsched_bytes); log(NORMAL, "Decile drift %.2f, best cutoffs: %d %d %d %d " "%d %d %d %d\n", drift, cutoffs[0], cutoffs[1], cutoffs[2], diff --git a/util/homa_test.cc b/util/homa_test.cc index fc3d2ebb..5546089f 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ // This file contains a collection of tests for the Linux implementation @@ -39,7 +28,9 @@ #include +extern "C" { #include "homa.h" +} #include "test_utils.h" /* Determines message size in bytes for tests. */ @@ -57,7 +48,11 @@ char *buf_region; /* Either AF_INET or AF_INET6: indicates whether to use IPv6 instead of IPv4. */ int inet_family = AF_INET; -/* Control blocks for receiving messages. */ +/* Control blocks for receiving messages. Reusing the same + * homa_recvmsg_args causes receive buffers to be returned to Homa + * automatically. Each call to recvmsg returns the buffers from the + * previous call. 
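+ * Note that recvmsg overwrites msg_controllen on each call, so it must
+ * be reset to sizeof(recv_args) before every receive; each test below
+ * does exactly that.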
+ */
struct homa_recvmsg_args recv_args;
struct msghdr recv_hdr;
@@ -88,16 +83,21 @@ void close_fd(int fd)
 */
void send_fd(int fd, const sockaddr_in_union *addr, char *request)
{
- uint64_t id;
+ struct homa_sendmsg_args homa_args;
+ struct msghdr msghdr;
+ struct iovec iov;
int status;
sleep(1);
- status = homa_send(fd, request, length, addr, &id, 0);
+ iov.iov_base = request;
+ iov.iov_len = length;
+ init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &addr->sa,
+ sockaddr_size(&addr->sa));
+ status = sendmsg(fd, &msghdr, 0);
if (status < 0) {
- printf("Error in homa_send: %s\n",
- strerror(errno));
+ printf("Error in sendmsg: %s\n", strerror(errno));
} else {
- printf("homa_send succeeded, id %lu\n", id);
+ printf("sendmsg succeeded, id %llu\n", homa_args.id);
}
}
@@ -150,7 +150,7 @@ void test_close()
}
std::thread thread(close_fd, fd);
recv_args.id = 0;
- recv_args.flags = HOMA_RECVMSG_RESPONSE;
+ recv_hdr.msg_controllen = sizeof(recv_args);
result = recvmsg(fd, &recv_hdr, 0);
if (result > 0) {
printf("Received %d bytes\n", result);
@@ -169,18 +169,25 @@ void test_close()
 */
void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request)
{
- uint64_t id;
- int status;
+ struct homa_sendmsg_args homa_args;
+ uint64_t start = rdtsc();
+ struct msghdr msghdr;
int completed = 0;
size_t total = 0;
-#define PRINT_INTERVAL 1000
ssize_t received;
- uint64_t start = rdtsc();
+ struct iovec iov;
+ int status;
+
+#define PRINT_INTERVAL 1000
+ iov.iov_base = request;
+ iov.iov_len = length;
+ init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
+ sockaddr_size(&dest->sa));
for (int i = 1; i <= count; i++) {
- status = homa_send(fd, request, length, dest, &id, 0);
+ status = sendmsg(fd, &msghdr, 0);
if (status < 0) {
- printf("Error in homa_send: %s\n",
+ printf("Error in sendmsg: %s\n",
strerror(errno));
sleep(1);
}
@@ -192,11 +199,11 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request)
total = 0;
for (int i = 1; i <= count; i++) {
recv_args.id = 0;
- recv_args.flags = HOMA_RECVMSG_RESPONSE;
+ recv_hdr.msg_controllen = sizeof(recv_args);
received = recvmsg(fd, &recv_hdr, 0);
if (received < 0) {
- printf("Error in recvmsg for id %lu: %s\n",
- id, strerror(errno));
+ printf("Error in recvmsg for id %llu: %s\n",
+ recv_args.id, strerror(errno));
} else {
total += received;
completed++;
@@ -221,19 +228,25 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request)
 */
void test_invoke(int fd, const sockaddr_in_union *dest, char *request)
{
- uint64_t id;
- int status;
+ struct homa_sendmsg_args homa_args;
+ struct msghdr msghdr;
ssize_t resp_length;
+ struct iovec iov;
+ int status;
- status = homa_send(fd, request, length, dest, &id, 0);
+ iov.iov_base = request;
+ iov.iov_len = length;
+ init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
+ sockaddr_size(&dest->sa));
+ status = sendmsg(fd, &msghdr, 0);
if (status < 0) {
- printf("Error in homa_send: %s\n", strerror(errno));
+ printf("Error in sendmsg: %s\n", strerror(errno));
return;
} else {
- printf("homa_send succeeded, id %lu\n", id);
+ printf("sendmsg succeeded, id %llu\n", homa_args.id);
}
recv_args.id = 0;
- recv_args.flags = HOMA_RECVMSG_RESPONSE;
+ recv_hdr.msg_controllen = sizeof(recv_args);
resp_length = recvmsg(fd, &recv_hdr, 0);
if (resp_length < 0) {
printf("Error in recvmsg: %s\n", strerror(errno));
@@ -242,7 +255,7 @@ void test_invoke(int fd, const sockaddr_in_union *dest, char *request)
int seed = check_message(&recv_args, buf_region, resp_length,
2*sizeof32(int));
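/* The response contents land in the mmap'ed region registered via
 * SO_HOMA_RCVBUF; recv_args.bpage_offsets[] gives the offsets of its
 * buffer pages there (see test_stream below for an example).
 */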
printf("Received message from %s with %lu bytes, " - "seed %d, id %lu\n", + "seed %d, id %llu\n", print_address(&source_addr), resp_length, seed, recv_args.id); } @@ -325,7 +338,7 @@ void test_poll(int fd, char *request) } recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_REQUEST; + recv_hdr.msg_controllen = sizeof(recv_args); result = recvmsg(fd, &recv_hdr, 0); if (result < 0) printf("Error in recvmsg: %s\n", strerror(errno)); @@ -334,6 +347,56 @@ void test_poll(int fd, char *request) result, ntohs(source_addr.in4.sin_port)); } +/** + * test_private() - Send several private requests and wait for responses in + * the reverse order. + * @fd: Homa socket. + * @dest: Where to send the request. + * @request: Request message. + */ +void test_private(int fd, const sockaddr_in_union *dest, char *request) +{ + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; + ssize_t resp_length; + struct iovec iov; + int status, i; + __u64 ids[3]; + + iov.iov_base = request; + iov.iov_len = length; + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + homa_args.flags = HOMA_SENDMSG_PRIVATE; + for (i = 0; i < 3; i++) { + homa_args.id = 0; + status = sendmsg(fd, &msghdr, 0); + ids[i] = homa_args.id; + if (status < 0) { + printf("Error in sendmsg: %s\n", strerror(errno)); + return; + } else { + printf("sendmsg succeeded, id %llu\n", ids[i]); + } + } + + for (i = 2; i >= 0; i--) { + recv_args.id = ids[i]; + recv_hdr.msg_controllen = sizeof(recv_args); + resp_length = recvmsg(fd, &recv_hdr, 0); + if (resp_length < 0) { + printf("Error in recvmsg: %s\n", strerror(errno)); + return; + } + int seed = check_message(&recv_args, buf_region, resp_length, + 2*sizeof32(int)); + printf("Received message from %s with %lu bytes, " + "seed %d, id %llu\n", + print_address(&source_addr), resp_length, seed, + recv_args.id); + } +} + /** * test_read() - Measure round-trip time for a read kernel call that * does nothing but return an error. 
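The sketch below (illustrative only, not part of this patch) condenses the private-RPC pattern that test_private exercises into a single round-trip helper. It assumes the homa.h and test_utils.h definitions used throughout this file (init_sendmsg_hdrs, sockaddr_size, sockaddr_in_union) and the global recv_args/recv_hdr control blocks set up in main():

/* Hypothetical helper, not in homa_test.cc: one private request/response
 * round trip on an already-configured Homa socket.
 */
ssize_t private_rpc(int fd, const sockaddr_in_union *dest, char *request,
		size_t len)
{
	struct homa_sendmsg_args homa_args;
	struct msghdr msghdr;
	struct iovec iov;

	/* Send the request as a private RPC; Homa returns the new RPC's
	 * id in homa_args.id.
	 */
	iov.iov_base = request;
	iov.iov_len = len;
	init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
			sockaddr_size(&dest->sa));
	homa_args.flags = HOMA_SENDMSG_PRIVATE;
	homa_args.id = 0;
	if (sendmsg(fd, &msghdr, 0) < 0)
		return -1;

	/* A nonzero recv_args.id restricts recvmsg to the response for
	 * that particular RPC.
	 */
	recv_args.id = homa_args.id;
	recv_hdr.msg_controllen = sizeof(recv_args);
	return recvmsg(fd, &recv_hdr, 0);
}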
@@ -370,21 +433,27 @@
 */
void test_rtt(int fd, const sockaddr_in_union *dest, char *request)
{
- int status;
+ uint64_t *times = new uint64_t[count];
+ struct homa_sendmsg_args homa_args;
+ struct msghdr msghdr;
ssize_t resp_length;
+ struct iovec iov;
uint64_t start;
- uint64_t *times = new uint64_t[count];
+ int status;
+ iov.iov_base = request;
+ iov.iov_len = length;
for (int i = -10; i < count; i++) {
start = rdtsc();
- status = homa_send(fd, request, length, dest, NULL, 0);
+ init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
+ sockaddr_size(&dest->sa));
+ status = sendmsg(fd, &msghdr, 0);
if (status < 0) {
- printf("Error in homa_send: %s\n",
- strerror(errno));
+ printf("Error in sendmsg: %s\n", strerror(errno));
return;
}
recv_args.id = 0;
- recv_args.flags = HOMA_RECVMSG_RESPONSE;
+ recv_hdr.msg_controllen = sizeof(recv_args);
resp_length = recvmsg(fd, &recv_hdr, 0);
if (i >= 0)
times[i] = rdtsc() - start;
@@ -410,41 +479,51 @@
 */
void test_send(int fd, const sockaddr_in_union *dest, char *request)
{
- uint64_t id;
+ struct homa_sendmsg_args homa_args;
+ struct msghdr msghdr;
+ struct iovec iov;
int status;
- status = homa_send(fd, request, length, dest, &id, 0);
+ iov.iov_base = request;
+ iov.iov_len = length;
+ init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
+ sockaddr_size(&dest->sa));
+ status = sendmsg(fd, &msghdr, 0);
if (status < 0) {
- printf("Error in homa_send: %s\n",
- strerror(errno));
+ printf("Error in sendmsg: %s\n", strerror(errno));
} else {
- printf("Homa_send succeeded, id %lu\n", id);
+ printf("sendmsg succeeded, id %llu\n", homa_args.id);
}
}
/**
 * test_set_buf() - Invoke homa_set_buf on a Homa socket.
- * @fd: Homa socket.
 */
-void test_set_buf(int fd)
+void test_set_buf(void)
{
- int status;
+ int status, fd;
char *region = (char *) mmap(NULL, 64*HOMA_BPAGE_SIZE,
PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
- struct homa_set_buf_args arg;
+ struct homa_rcvbuf_args arg;
if (region == MAP_FAILED) {
printf("Couldn't mmap buffer region: %s\n", strerror(errno));
return;
}
- arg.start = region;
+ fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA);
+ if (fd < 0) {
+ printf("Couldn't open Homa socket: %s\n", strerror(errno));
+ return;
+ }
+
+ arg.start = (uintptr_t)region;
arg.length = 64*HOMA_BPAGE_SIZE;
- status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg,
+ status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg,
sizeof(arg));
if (status < 0)
- printf("Error in setsockopt(SO_HOMA_SET_BUF): %s\n",
+ printf("Error in setsockopt(SO_HOMA_RCVBUF): %s\n",
strerror(errno));
+ close(fd);
}
/**
@@ -458,7 +537,7 @@ void test_shutdown(int fd)
std::thread thread(shutdown_fd, fd);
thread.detach();
recv_args.id = 0;
- recv_args.flags = HOMA_RECVMSG_RESPONSE;
+ recv_hdr.msg_controllen = sizeof(recv_args);
result = recvmsg(fd, &recv_hdr, 0);
if (result > 0) {
printf("Received %d bytes\n", result);
@@ -469,7 +548,7 @@
/* Make sure that future reads also fail.
*/ recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_hdr.msg_controllen = sizeof(recv_args); result = recvmsg(fd, &recv_hdr, 0); if (result < 0) { printf("Second recvmsg call also failed: %s\n", @@ -489,14 +568,16 @@ void test_shutdown(int fd) void test_stream(int fd, const sockaddr_in_union *dest) { #define MAX_RPCS 100 + struct homa_sendmsg_args homa_args; + uint64_t end_cycles, end_time; + uint64_t start_cycles = 0; + int64_t start_bytes = 0; int *buffers[MAX_RPCS]; + int64_t bytes_sent = 0; + struct msghdr msghdr; ssize_t resp_length; - uint64_t id, end_cycles; - uint64_t start_cycles = 0; - uint64_t end_time; + struct iovec iov; int status, i; - int64_t bytes_sent = 0; - int64_t start_bytes = 0; double rate; end_time = rdtsc() + (uint64_t) (5*get_cycles_per_sec()); @@ -512,10 +593,14 @@ void test_stream(int fd, const sockaddr_in_union *dest) buffers[i][1] = 12; seed_buffer(buffers[i]+2, length - 2*sizeof32(int), 1000*i); } + iov.iov_len = length; for (i = 0; i < count; i++) { - status = homa_send(fd, buffers[i], length, dest, &id, 0); + iov.iov_base = buffers[i]; + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - printf("Error in homa_send: %s\n", strerror(errno)); + printf("Error in sendmsg: %s\n", strerror(errno)); return; } } @@ -524,25 +609,26 @@ void test_stream(int fd, const sockaddr_in_union *dest) * response to an outstanding request, then initiates a new * request. */ - while (1){ + while (1) { int *response; recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_hdr.msg_controllen = sizeof(recv_args); resp_length = recvmsg(fd, &recv_hdr, 0); if (resp_length < 0) { - printf("Error in recvmsg: %s\n", - strerror(errno)); + printf("Error in recvmsg: %s\n", strerror(errno)); return; } if (resp_length != 12) printf("Expected 12 bytes in response, received %ld\n", resp_length); response = (int *) (buf_region + recv_args.bpage_offsets[0]); - status = homa_send(fd, buffers[(response[2]/1000) %count], - length, dest, &id, 0); + iov.iov_base = buffers[(response[2]/1000) % count]; + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - printf("Error in homa_send: %s\n", strerror(errno)); + printf("Error in sendmsg: %s\n", strerror(errno)); return; } bytes_sent += length; @@ -770,8 +856,6 @@ void test_tmp(int fd, int count) h.msg_controllen = sizeof(control); memset(&control, 0, sizeof(control)); - control.flags = HOMA_RECVMSG_REQUEST | HOMA_RECVMSG_REQUEST - | HOMA_RECVMSG_NONBLOCKING; int result = recvmsg(fd, &h, 0); printf("recvmsg returned %d, addr %p, namelen %d, control %p, " @@ -815,6 +899,108 @@ void test_udpclose() } } +/* Receive one message every second. After a few messages have been + * received, shut down the socket to make sure that the wmem waiting + * mechanism aborts properly. 
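+ * (Each response consumed here completes an RPC, which should let Homa
+ * reclaim that RPC's transmit buffers and unblock the sender in the
+ * tests below.)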
+ */ +void recv_slow(int fd) +{ + int status; + + for (int i = 0; i < 15; i++) { + sleep(1); + recv_args.id = 0; + recv_hdr.msg_controllen = sizeof(recv_args); + status = recvmsg(fd, &recv_hdr, 0); + if (status < 0) { + printf("Receiver exiting: %s\n", strerror(errno)); + return; + } + printf("Received response %d with %d bytes\n", i, status); + } + printf("Receiver shutting down socket\n"); + shutdown(fd, 0); +} + +/** + * test_wmem() - Use two threads, a sender and a receiver, and make the + * receiver go so slowly that the sender uses up all available tx packet + * memory and blocks. Note: specify a large --length parameter. + * @fd: Homa socket. + * @dest: Where to send the request + * @request: Request message. + */ +void test_wmem(int fd, const sockaddr_in_union *dest, char *request) +{ + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; + struct iovec iov; + int status; + + std::thread thread(recv_slow, fd); + + iov.iov_base = request; + iov.iov_len = length; + for (int i = 0; i < count; i++) { + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, 0); + if (status < 0) { + printf("Error in sendmsg: %s\n", strerror(errno)); + break; + } + printf("Sent request %d with %d bytes\n", i, length); + } + printf("Sender shutting down socket\n"); + shutdown(fd, 0); + thread.join(); +} + +/** + * test_wmem_poll() - Use two threads, a sender and a receiver, and make the + * receiver go so slowly that the sender uses up all available tx packet + * memory and blocks. On the sender, use poll to wait for tx packet memory. + * @fd: Homa socket. + * @dest: Where to send the request + * @request: Request message. + */ +void test_wmem_poll(int fd, const sockaddr_in_union *dest, char *request) +{ + struct homa_sendmsg_args homa_args; + struct pollfd poll_info = { + .fd = fd, + .events = POLLOUT, + .revents = 0 + }; + struct msghdr msghdr; + struct iovec iov; + int status; + + std::thread thread(recv_slow, fd); + + iov.iov_base = request; + iov.iov_len = length; + for ( ; count > 0; count--) { + status = poll(&poll_info, 1, -1); + if (status > 0) { + printf("Poll succeeded with mask 0x%x\n", poll_info.revents); + } else { + printf("Poll failed: %s\n", strerror(errno)); + break; + } + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, MSG_DONTWAIT); + if (status < 0) { + printf("Error in sendmsg: %s\n", strerror(errno)); + break; + } + printf("Sent request with %d bytes\n", length); + } + shutdown(fd, 0); + thread.join(); +} + int main(int argc, char** argv) { int fd, status, port, next_arg; @@ -921,18 +1107,17 @@ int main(int argc, char** argv) printf("Couldn't mmap buffer region: %s\n", strerror(errno)); exit(1); } - struct homa_set_buf_args arg; - arg.start = buf_region; + struct homa_rcvbuf_args arg; + arg.start = (uintptr_t)buf_region; arg.length = 1000*HOMA_BPAGE_SIZE; - status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, + status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); if (status < 0) { - printf("Error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("Error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); exit(1); } recv_args.id = 0; - recv_args.flags = 0; recv_args.num_bpages = 0; recv_hdr.msg_name = &source_addr; recv_hdr.msg_namelen = sizeof32(source_addr); @@ -953,6 +1138,8 @@ int main(int argc, char** argv) test_ioctl(fd, count); } else if (strcmp(argv[next_arg], "poll") == 0) { test_poll(fd, buffer); + } 
else if (strcmp(argv[next_arg], "private") == 0) {
+ test_private(fd, &dest, buffer);
} else if (strcmp(argv[next_arg], "send") == 0) {
test_send(fd, &dest, buffer);
} else if (strcmp(argv[next_arg], "read") == 0) {
@@ -962,7 +1149,7 @@
} else if (strcmp(argv[next_arg], "shutdown") == 0) {
test_shutdown(fd);
} else if (strcmp(argv[next_arg], "set_buf") == 0) {
- test_set_buf(fd);
+ test_set_buf();
} else if (strcmp(argv[next_arg], "stream") == 0) {
test_stream(fd, &dest);
} else if (strcmp(argv[next_arg], "tcp") == 0) {
@@ -973,6 +1160,10 @@
test_tmp(fd, count);
} else if (strcmp(argv[next_arg], "udpclose") == 0) {
test_udpclose();
+ } else if (strcmp(argv[next_arg], "wmem") == 0) {
+ test_wmem(fd, &dest, buffer);
+ } else if (strcmp(argv[next_arg], "wmem_poll") == 0) {
+ test_wmem_poll(fd, &dest, buffer);
} else {
printf("Unknown operation '%s'\n", argv[next_arg]);
exit(1);
diff --git a/util/inc_tput.cc b/util/inc_tput.cc new file mode 100644 index 00000000..2bf31bd3 --- /dev/null +++ b/util/inc_tput.cc @@ -0,0 +1,88 @@
+/* Copyright (c) 2024 Homa Developers
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+ */
+
+/* This program measures the throughput of atomic increments in the face
+ * of many concurrent cores invoking it.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <thread>
+#include <vector>
+
+std::atomic_int value = 0;
+std::vector<int> thread_counts;
+
+/**
+ * rdtsc(): return the current value of the fine-grain CPU cycle counter
+ * (accessed via the RDTSC instruction).
+ */
+inline static uint64_t rdtsc(void)
+{
+ uint32_t lo, hi;
+ __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
+ return (((uint64_t)hi << 32) | lo);
+}
+
+void increment(int index)
+{
+ while (1) {
+ value.fetch_add(1);
+ thread_counts[index]++;
+ }
+}
+
+int main(int argc, char** argv)
+{
+ int num_threads = 1;
+ int i;
+ std::vector<int> old_counts;
+
+ if (argc == 2) {
+ char *end;
+ num_threads = strtol(argv[1], &end, 0);
+ if (*end != 0) {
+ printf("Illegal argument %s: must be integer\n",
+ argv[1]);
+ exit(1);
+ }
+ } else if (argc != 1) {
+ printf("Usage: %s [num_threads]\n", argv[0]);
+ }
+
+ /* Populate both vectors before spawning any threads, so that vector
+ * reallocation can't race with increment() running on another core.
+ */
+ for (i = 0; i < num_threads; i++) {
+ thread_counts.emplace_back(0);
+ old_counts.emplace_back(0);
+ }
+ for (i = 0; i < num_threads; i++)
+ new std::thread(increment, i);
+
+ struct timeval prev_time, cur_time;
+ gettimeofday(&prev_time, nullptr);
+ uint64_t old_value = value;
+ while (1) {
+ sleep(1);
+ gettimeofday(&cur_time, nullptr);
+ uint64_t new_value = value;
+ double diff = new_value - old_value;
+ double secs = cur_time.tv_sec - prev_time.tv_sec;
+ secs += 1e-6*(cur_time.tv_usec - prev_time.tv_usec);
+ printf("%.2f Mops/sec [", (diff/secs)*1e-6);
+ const char *sep = "";
+ for (i = 0; i < num_threads; i++) {
+ int new_count = thread_counts[i];
+ diff = new_count - old_counts[i];
+ printf("%s%.2f", sep, (diff/secs)*1e-6);
+ sep = " ";
+ old_counts[i] = new_count;
+ }
+ printf("]\n");
+ prev_time = cur_time;
+ old_value = new_value;
+ }
+} \ No newline at end of file
diff --git a/util/metrics.py b/util/metrics.py index 7e5b8fce..5662e859 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -1,18 +1,7 @@
#!/usr/bin/python3
-# Copyright (c) 2019-2022 Stanford University
-#
-# Permission to use, copy, modify, and distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2019-2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Reads Homa metrics from the kernel and prints out anything that is changed @@ -97,7 +86,9 @@ def scale_number(number): suffix to keep the number small and readable """ - if number > 1000000: + if number > 1e9: + return "%5.1f G" % (number/1e9) + elif number > 1000000: return "%5.1f M" % (number/1000000.0) elif (number > 1000): return "%5.1f K" % (number/1000.0) @@ -124,7 +115,7 @@ def scale_number(number): # Sum all of the individual core counts for both the new and old data and # compute the difference in "deltas" for symbol in symbols: - if (symbol == "rdtsc_cycles") or (symbol == "cpu_khz") or (symbol == "core"): + if (symbol == "time_cycles") or (symbol == "cpu_khz") or (symbol == "core"): # This symbol shouldn't be summed. continue total_cur = 0 @@ -132,7 +123,8 @@ def scale_number(number): total_cur += core[symbol] total_prev = 0 for core in prev: - total_prev += core[symbol] + if symbol in core: + total_prev += core[symbol] delta = total_cur - total_prev deltas[symbol] = delta @@ -146,22 +138,19 @@ def scale_number(number): cpu_khz = float(cur[0]["cpu_khz"]) if len(prev) > 0: - time_delta = cur[0]["rdtsc_cycles"] - prev[0]["rdtsc_cycles"] + time_delta = cur[0]["time_cycles"] - prev[0]["time_cycles"] elapsed_secs = float(time_delta)/(cpu_khz * 1000.0) pad = pad.ljust(13) secs = "(%.1f s)" % (elapsed_secs) secs = secs.ljust(12) - print("%-28s %15d %s %s" % ("rdtsc_cycles", time_delta, secs, - docs["rdtsc_cycles"])) + print("%-30s %15d %s %s" % ("time_cycles", time_delta, secs, + docs["time_cycles"])) else: - print("%-15s %28d %s%s" % ("rdtsc_cycles", cur[0]["rdtsc_cycles"], - "", docs["rdtsc_cycles"])) - -print("%-28s %5.2f %sCPU clock rate (GHz)" % ("clock_rate", - cpu_khz/1e06, pad)) + print("%-17s %28d %s%s" % ("time_cycles", cur[0]["time_cycles"], + "", docs["time_cycles"])) for symbol in symbols: - if (symbol == "rdtsc_cycles") or (symbol == "cpu_khz"): + if (symbol == "time_cycles") or (symbol == "cpu_khz"): # This symbol is handled specially above continue delta = deltas[symbol] @@ -176,7 +165,7 @@ def scale_number(number): if symbol.endswith("_cycles") and (time_delta != 0): percent = "(%.1f%%)" % (100.0*delta/time_delta) percent = percent.ljust(12) - print("%-28s %15d %s %s" % (symbol, delta, percent, doc)) + print("%-30s %15d %s %s" % (symbol, delta, percent, doc)) elif symbol.endswith("_queued") and (time_delta != 0): received = deltas[symbol[:-7] + "_received"] if received != 0: @@ -184,31 +173,36 @@ def scale_number(number): else: percent = " " percent = percent.ljust(12) - print("%-28s %15d %s %s" % (symbol, delta, percent, doc)) + print("%-30s %15d %s %s" % (symbol, delta, percent, doc)) else: - print("%-28s %15d %s%s" % (symbol, delta, rate_info, doc)) + print("%-30s %15d %s%s" % (symbol, delta, rate_info, doc)) if symbol.startswith("packets_rcvd_"): total_packets += delta if symbol == "softirq_calls": gro_packets = delta if (symbol == "reaper_dead_skbs") and 
("reaper_calls" in deltas): - print("%-28s %6.1f %sAvg. hsk->dead_skbs in reaper" % ( + print("%-30s %6.1f %sAvg. hsk->dead_skbs in reaper" % ( "avg_dead_skbs", delta/deltas["reaper_calls"], pad)) if symbol.endswith("_miss_cycles") and (time_delta != 0): prefix = symbol[:-12] - if (prefix + "_misses") in deltas: + if ((prefix + "_misses") in deltas) and (deltas[prefix + "_misses"] != 0): ns = (delta/deltas[prefix + "_misses"])/(cpu_khz * 1e-06) - print("%-28s %6.1f %sAvg. wait time per %s miss (ns)" % ( + print("%-30s %6.1f %sAvg. wait time per %s miss (ns)" % ( prefix + "_miss_delay", ns, pad, prefix)) if (symbol == "large_msg_bytes") and (total_received_bytes != 0) \ and (time_delta != 0): rate = float(total_received_bytes)/elapsed_secs rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13) - print("%-28s %15d %s%s" % ("received_msg_bytes", total_received_bytes, + print("%-30s %15d %s%s" % ("received_msg_bytes", total_received_bytes, rate_info, "Total bytes in all incoming messages")) if gro_packets != 0: - print("%-28s %6.2f %sHoma packets per homa_softirq call" % ( + print("%-30s %6.2f %sHoma packets per homa_softirq call" % ( "gro_benefit", float(total_packets)/float(gro_packets), pad)) +avg_grantable_rpcs = 0.0 +if ("grantable_rpcs_integral" in deltas) and (time_delta != 0): + avg_grantable_rpcs = float(deltas["grantable_rpcs_integral"])/time_delta + print("%-30s %6.2f %sAverage number of grantable incoming RPCs" % ( + "avg_grantable_rpcs", avg_grantable_rpcs, pad)) if elapsed_secs != 0: print("\nPer-Core CPU Usage:") @@ -228,8 +222,7 @@ def scale_number(number): for core in range(first_core, end_core): line += " Core%-2d" % (core) print(line) - for where in ["napi", "softirq", "send", "recv", "reply", - "timer", "pacer"]: + for where in ["napi", "softirq", "send", "recv", "reply", "timer"]: if where == "softirq": symbol = "linux_softirq_cycles" else: @@ -248,6 +241,14 @@ def scale_number(number): busiest_core = core print(line) print("\nBusiest core: %d (%.2f)" % (busiest_core, totals[busiest_core])) + other_busy = "" + for core in range(0, num_cores): + if (totals[core] >= 0.8) and (core != busiest_core): + if other_busy: + other_busy += ", " + other_busy += " %d (%.2f)" % (core, totals[core]); + if other_busy: + print("Other busy cores: %s" % (other_busy)) packets_received = 0.0 packets_sent = 0.0 @@ -303,7 +304,7 @@ def scale_number(number): cores = cpu_time/time_delta if packets_received > 0: print("%s %6.2f %7.2f us/packet" % (print_name.ljust(22), - cores, (cpu_time/packets_received) / (cpu_khz/1e03))) + cores, (cpu_time/(cpu_khz*1000)/packets_received) * 1e6)) else: print("%s %6.2f" % (print_name.ljust(22), cores)) cpu_time = float(deltas["napi_cycles"]) @@ -325,29 +326,36 @@ def scale_number(number): print("----------------------------------") print("Total Core Utilization %6.2f" % (total_cores_used)) - cpu_time = float(deltas["grant_cycles"]) - cores = cpu_time/time_delta - num_grants = deltas["packets_sent_GRANT"] - if num_grants != 0: - per_grant = cpu_time/float(deltas["packets_sent_GRANT"])/(cpu_khz/1e03) - else: - per_grant = 0.0 - print("\nSending grants %6.2f %7.2f us/grant" % (cores, - per_grant)) - time = float(deltas["poll_cycles"]) cores = time/time_delta calls = float(deltas["recv_calls"]) if calls == 0: us_per = 0 else: - us_per = (time/calls)/(cpu_khz/1e03) + us_per = (time/calls)/(cpu_khz*1e-3) + print("") print("Polling in recv %6.2f %7.2f us/syscall" % (cores, us_per)) + calls = deltas["skb_allocs"] + if calls == 0: + us_per = 0 + else: + us_per = 
(deltas["skb_alloc_cycles"]/calls)/(cpu_khz*1e-3) + print("Skb allocation %6.2f %7.2f us/skb" % ( + deltas["skb_alloc_cycles"]/time_delta, us_per)) + + calls = deltas["skb_frees"] + if calls == 0: + us_per = 0 + else: + us_per = (deltas["skb_free_cycles"]/calls)/(cpu_khz*1e-3) + print("Skb freeing %6.2f %7.2f us/skb" % ( + deltas["skb_free_cycles"]/time_delta, us_per)) + print("\nLock Misses:") print("------------") print(" Misses/sec. ns/Miss %CPU") - for lock in ["client", "socket", "grantable", "throttle", "peer_ack"]: + for lock in ["client", "server", "socket", "grant", "throttle", "peer_ack"]: misses = float(deltas[lock + "_lock_misses"]) cycles = float(deltas[lock + "_lock_miss_cycles"]) if misses == 0: @@ -363,73 +371,132 @@ def scale_number(number): if total_messages > 0.0: print("\nReceiving Messages:") print("-------------------") - poll_percent = 100.0*float(deltas["fast_wakeups"])/total_messages - sleep_percent = 100.0*float(deltas["slow_wakeups"])/total_messages - print("Available immediately: %4.1f%%" % (100.0 - poll_percent - - sleep_percent)) - print("Arrived while polling: %4.1f%%" % (poll_percent)) - print("Blocked at least once: %4.1f%%" % (sleep_percent)) + avail_percent = 100.0*float(deltas["wait_none"])/total_messages + poll_percent = 100.0*float(deltas["wait_fast"])/total_messages + sleep_percent = 100.0*float(deltas["wait_block"])/total_messages + if deltas["gen3_alt_handoffs"]: + gen3_alt_percent = (100.0*deltas["gen3_alt_handoffs"] + /deltas["gen3_handoffs"]) + else: + gen3_alt_percent = 0.0 + if deltas["handoffs_alt_thread"]: + alt_thread_percent = (100.0*deltas["handoffs_alt_thread"] + /deltas["handoffs_thread_waiting"]) + else: + alt_thread_percent = 0.0 + if deltas["packets_rcvd_DATA"]: + data_bypass_percent = (100.0*deltas["gro_data_bypasses"] + /deltas["packets_rcvd_DATA"]) + else: + data_bypass_percent = 0.0 + if deltas["packets_rcvd_GRANT"]: + grant_bypass_percent = (100.0*deltas["gro_grant_bypasses"] + /deltas["packets_rcvd_GRANT"]) + else: + grant_bypass_percent = 0.0 + print("Available immediately: %5.1f%%" % (avail_percent)) + print("Arrived while polling: %5.1f%%" % (poll_percent)) + print("Blocked at least once: %5.1f%%" % (sleep_percent)) + print("Alternate GRO handoffs: %5.1f%%" % (gen3_alt_percent)) + print("Alternate thread handoffs: %5.1f%%" % (alt_thread_percent)) + print("GRO bypass for data packets: %5.1f%%" % (data_bypass_percent)) + print("GRO bypass for grant packets: %5.1f%%" % (grant_bypass_percent)) + + pacer_bytes = deltas["pacer_homa_bytes"] + deltas["pacer_tcp_bytes"] + if pacer_bytes != 0: + print("\nPacer:") + print("--------") + if packets_sent > 0: + print("Homa packets sent: %5.3f M/sec (%.1f %% of all Homa packets)" % + (1e-6*deltas["pacer_homa_packets"]/elapsed_secs, + 100*deltas["pacer_homa_packets"]/packets_sent)) + else: + print("Homa packets sent: 0.000 M/sec") + print("Homa throughput (inc. headers): %5.2f Gbps" % + (8e-9*deltas["pacer_homa_bytes"]/elapsed_secs)) + qdisc_tcp_packets = deltas["qdisc_tcp_packets"] + if qdisc_tcp_packets != 0: + print("TCP packets sent: %5.3f M/sec (%.1f %% of all TCP packets)" % + (1e-6*deltas["pacer_tcp_packets"]/elapsed_secs, + 100*deltas["pacer_tcp_packets"]/qdisc_tcp_packets)) + else: + print("TCP packets sent: 0.000 M/sec") + print("TCP throughput (inc. 
headers): %5.2f Gbps" %
+ (8e-9*deltas["pacer_tcp_bytes"]/elapsed_secs))
+ print("Helper throughput (Homa + TCP): %5.2f Gbps (%.1f%% of all pacer bytes)" %
+ (8e-9*deltas["pacer_help_bytes"]/elapsed_secs,
+ 100*deltas["pacer_help_bytes"]/pacer_bytes))
+ backlog_secs = float(deltas["nic_backlog_cycles"])/(cpu_khz * 1000.0)
+ print("Active throughput: %5.2f Gbps (NIC backlogged %.1f%% of time)" % (
+ pacer_bytes*8e-09/backlog_secs, 100*backlog_secs/elapsed_secs))
+ xmit_secs = float(deltas["pacer_xmit_cycles"])/(cpu_khz * 1000.0)
+ print("Pacer thread duty cycle: %5.1f %%" %
+ (100*deltas["pacer_cycles"]/time_delta))
+ print("Time xmitting packets: %5.1f %% (%.2f usec/packet)" %
+ (100*xmit_secs/elapsed_secs,
+ 1e6*xmit_secs/(deltas["pacer_homa_packets"]
+ + deltas["pacer_tcp_packets"])))
print("\nMiscellaneous:")
print("--------------")
if packets_received > 0:
- print("Bytes/packet rcvd: %6.0f" % (
+ print("Bytes/packet rcvd: %6.0f" % (
total_received_bytes/packets_received))
- print("Packets received: %5.3f M/sec" % (
+ print("Packets received: %5.3f M/sec" % (
1e-6*packets_received/elapsed_secs))
- print("Packets sent: %5.3f M/sec" % (
+ print("Packets sent: %5.3f M/sec" % (
1e-6*packets_sent/elapsed_secs))
- print("Core efficiency: %5.3f M packets/sec/core "
+ print("Core efficiency: %5.3f M packets/sec/core "
"(sent & received combined)" % (
1e-6*(packets_sent + packets_received)/elapsed_secs
/total_cores_used))
- print(" %5.2f Gbps/core (goodput)" % (
+ print(" %5.2f Gbps/core (goodput)" % (
8e-9*(total_received_bytes + float(deltas["sent_msg_bytes"]))
/(total_cores_used * elapsed_secs)))
- if deltas["throttled_cycles"] != 0:
- throttled_secs = float(deltas["throttled_cycles"])/(cpu_khz * 1000.0)
- print("Pacer throughput: %5.2f Gbps" % (
- deltas["pacer_bytes"]*8e-09/throttled_secs))
+ if deltas["skb_allocs"] != 0:
+ print("Skb alloc time: %4.2f usec/skb" % (
+ float(deltas["skb_alloc_cycles"]) / (cpu_khz / 1000.0) /
+ deltas["skb_allocs"]))
+ if deltas["skb_page_allocs"] != 0:
+ print("Skb page alloc time: %5.2f usec/page" % (
+ float(deltas["skb_page_alloc_cycles"]) / (cpu_khz / 1000.0) /
+ deltas["skb_page_allocs"]))
print("\nCanaries (possible problem indicators):")
print("---------------------------------------")
- for symbol in ["requests_queued", "responses_queued"]:
- delta = deltas[symbol]
- if delta != 0:
- received = deltas[symbol[:-7] + "_received"]
- if (received != 0):
- percent = "(%.1f%%)" % (100.0*float(delta)/float(received))
- percent = percent.ljust(12)
- print("%-28s %15d %s %s" % (symbol, delta, percent, docs[symbol]))
- for symbol in ["resent_packets", "resent_packets_used", "unknown_rpcs",
+ for symbol in ["resent_packets", "resent_packets_used",
+ "packet_discards", "resent_discards", "unknown_rpcs",
"peer_kmalloc_errors", "peer_route_errors",
"control_xmit_errors", "data_xmit_errors",
"server_cant_create_rpcs",
- "short_packets", "redundant_packets",
- "peer_timeouts", "server_rpc_discards",
- "server_rpcs_unknown", "forced_reaps"]:
+ "short_packets", "rpc_timeouts", "server_rpc_discards",
+ "server_rpcs_unknown", "buffer_alloc_failures",
+ "dropped_data_no_bufs", "linux_pkt_alloc_bytes"]:
if deltas[symbol] == 0:
continue
rate = float(deltas[symbol])/elapsed_secs
rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13)
- print("%-28s %15d %s%s" % (symbol, deltas[symbol],
+ print("%-30s %15d %s%s" % (symbol, deltas[symbol],
rate_info, docs[symbol]))
- for symbol in ["pacer_lost_cycles", "timer_reap_cycles",
- "data_pkt_reap_cycles"]:
+ for symbol in ["timer_reap_cycles", "data_pkt_reap_cycles",
+ "grant_lock_cycles"]:
delta = deltas[symbol]
if delta == 0 or time_delta == 0:
continue
percent = "(%.1f%%)" % (100.0*delta/time_delta)
percent = percent.ljust(12)
- print("%-28s %15d %s %s" % (symbol, delta, percent, docs[symbol]))
+ print("%-30s %15d %s %s" % (symbol, delta, percent, docs[symbol]))
if deltas["throttle_list_adds"] > 0:
- print("%-28s %15.1f List traversals per throttle "
+ print("%-30s %15.1f List traversals per throttle "
"list insert" % ("checks_per_throttle_insert",
deltas["throttle_list_checks"]/deltas["throttle_list_adds"]))
if deltas["responses_received"] > 0:
- print("%-28s %15.1f ACK packets sent per 1000 client RPCs"
- % ("acks_per_rpc", 1000.0 * deltas["packets_sent_ACK"]
+ print("%-30s %15.1f ACK packets sent per 1000 client RPCs"
+ % ("acks_per_krpc", 1000.0 * deltas["packets_sent_ACK"]
/ deltas["responses_received"]))
+ if avg_grantable_rpcs > 1.0:
+ print("%-30s %6.2f %sAverage number of grantable incoming RPCs" % (
+ "avg_grantable_rpcs", avg_grantable_rpcs, pad))
diff --git a/util/plot.py b/util/plot.py new file mode 100755 index 00000000..1af0516a --- /dev/null +++ b/util/plot.py @@ -0,0 +1,195 @@
+#!/usr/bin/python3
+
+# Copyright (c) 2023 Homa Developers
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+# This file provides a library of functions for generating plots.
+
+import matplotlib
+import matplotlib.pyplot as plt
+import os
+from pathlib import Path
+import re
+import string
+import sys
+
+from cperf import *
+
+# Standard colors for plotting
+color_green = '#00B000'
+color_blue = '#1759BB'
+color_red = '#d62728'
+tcp_color = '#00B000'
+tcp_color2 = '#5BD15B'
+tcp_color3 = '#96E296'
+homa_color = '#1759BB'
+homa_color2 = '#6099EE'
+homa_color3 = '#A6C6F6'
+dctcp_color = '#7A4412'
+dctcp_color2 = '#CB701D'
+dctcp_color3 = '#EAA668'
+unloaded_color = '#d62728'
+
+matplotlib.rc('mathtext', default='regular')
+
+# Dictionary containing all data that has been read from files so far.
+# Keys are file names, values are dictionaries of columns for that file,
+# in which keys are column names and values are lists of the values
+# in that column.
+file_data = {}
+
+def __read_file(file):
+ """
+ Read a file and add its contents to the file_data variable. If the
+ file has already been read, then this function does nothing.
+
+ file: Path name of the file to read. Lines starting with '#' are
+ considered comments and ignored, as are blank lines. Of the
+ non-blank non-comment lines, the first contains space-separated
+ column names, and the others contain data for those columns.
+ """
+ global file_data
+
+ if file in file_data:
+ return
+ columns = {}
+ names = None
+ f = open(file)
+ for line in f:
+ fields = line.strip().split()
+ if len(fields) == 0:
+ continue
+ if fields[0].startswith('#'):
+ continue
+ if not names:
+ names = fields
+ for n in names:
+ if n in columns:
+ print('Duplicate column name %s in %s' % (n, file),
+ file=sys.stderr)
+ columns[n] = []
+ else:
+ if len(fields) != len(names):
+ print('Bad line in %s: %s (expected %d columns, got %d)'
+ % (file, line.rstrip(), len(columns), len(fields)),
+ file=sys.stderr)
+ continue
+ for i in range(0, len(names)):
+ try:
+ value = float(fields[i])
+ except ValueError:
+ value = fields[i]
+ columns[names[i]].append(value)
+ f.close()
+ file_data[file] = columns
+
+def get_column(file, column):
+ """
+ Return a list containing the values of a given column in a given file.
+
+ file: Path name of the file containing the desired column.
+ column: Name of the column within that file.
+ """
+
+ __read_file(file)
+ if column not in file_data[file]:
+ raise Exception('Column %s doesn\'t exist in %s' % (column, file))
+ return file_data[file][column]
+
+def get_column_names(file):
+ """
+ Returns a list containing the names of all of the columns in file.
+ """
+
+ __read_file(file)
+ return file_data[file].keys()
+
+def get_numbers(file):
+ """
+ Scans all of the column names in file for numbers and returns a
+ sorted list of all the unique numbers found.
+ """
+
+ numbers = set()
+ for name in get_column_names(file):
+ match = re.match('[^0-9]*([0-9]+)', name)
+ if match:
+ numbers.add(int(match.group(1)))
+ return sorted(list(numbers))
+
+def max_value(file, columns):
+ """
+ Returns the largest value in a set of columns.
+
+ columns: A list of column names.
+ """
+
+ overall_max = None
+ for column in columns:
+ col_max = max(get_column(file, column))
+ if (overall_max is None) or (col_max > overall_max):
+ overall_max = col_max
+ return overall_max
+
+def node_name(file):
+ """
+ Given the name of a trace file, return a shorter name that can be
+ used (e.g. in titles) to identify the node represented by the file.
+ """
+ name = Path(file).stem
+ i = name.rfind('_')
+ if i != -1:
+ name = name[i+1:]
+ return name
+
+def start_plot(max_x, max_y, title="", x_label="", y_label="", size=10,
+ figsize=[6,4]):
+ """
+ Create a basic pyplot graph without plotting any data. Returns the
+ Axes object for the plot.
+
+ max_x: Maximum x-coordinate
+ max_y: Maximum y-coordinate
+ title: Title for the plot; empty means no title
+ x_label: Label for x-axis
+ y_label: Label for y-axis
+ size: Size to use for fonts
+ figsize: Dimensions of plot
+ """
+
+ fig = plt.figure(figsize=figsize)
+ ax = fig.add_subplot(111)
+ if title != '':
+ ax.set_title(title, size=size)
+ ax.set_xlim(0, max_x)
+ ax.set_ylim(1, max_y)
+ if x_label:
+ ax.set_xlabel(x_label, size=size)
+ if y_label:
+ ax.set_ylabel(y_label, size=size)
+ return ax
+
+def plot_colors(file):
+ """
+ Generates a test plot that shows the standard colors defined above.
+
+ file: Name of PDF file in which to write the plot.
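+
+ The colors() wrapper in plot_tthoma.py invokes this function from the
+ command line, e.g. "plot_tthoma.py colors colors.pdf" (the output file
+ name here is just an example).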
+ """ + + ax = start_plot(200, 100, title='Standard Colors') + ax.plot([0, 200], [65, 65], color=color_green, label='color_green') + ax.plot([0, 200], [60, 60], color=color_blue, label='color_blue') + ax.plot([0, 200], [55, 55], color=color_red, label='color_red') + ax.plot([0, 200], [50, 50], color=tcp_color, label='tcp_color') + ax.plot([0, 200], [45, 45], color=tcp_color2, label='tcp_color2') + ax.plot([0, 200], [40, 40], color=tcp_color3, label='tcp_color3') + ax.plot([0, 200], [35, 35], color=homa_color, label='homa_color') + ax.plot([0, 200], [30, 30], color=homa_color2, label='homa_color2') + ax.plot([0, 200], [25, 25], color=homa_color3, label='homa_color3') + ax.plot([0, 200], [20, 20], color=dctcp_color, label='dctcp_color') + ax.plot([0, 200], [15, 15], color=dctcp_color2, label='dctcp_color2') + ax.plot([0, 200], [10, 10], color=dctcp_color3, label='dctcp_color3') + ax.plot([0, 200], [5, 5], color=unloaded_color, label='unloaded_color') + ax.legend(loc='upper right', prop={'size': 9}) + plt.tight_layout() + plt.savefig(file) \ No newline at end of file diff --git a/util/plot_tthoma.py b/util/plot_tthoma.py new file mode 100755 index 00000000..27aee7e6 --- /dev/null +++ b/util/plot_tthoma.py @@ -0,0 +1,90 @@ +#!/usr/bin/python3 + +# Copyright (c) 2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +# This file provides a collection of functions that plot data generated +# by tthoma.py. Invoke with the --help option for more information. + +from glob import glob +from optparse import OptionParser +import math +import matplotlib +import matplotlib.pyplot as plt +import os +from pathlib import Path +import re +import string +import sys + +import plot + +def backlog(data_file, plot_file): + """ + Generates a plot of network backlog data produced by the "net" + analyzer of tthoma.py. + + data_file: Backlog data file generated by tthoma.py. + plot_file: Name of the file in which to output a plot. + """ + global options + + cores = plot.get_numbers(data_file) + if options.cores: + cores = sorted(list(set(cores).intersection(options.cores))) + columns = [] + core_names = [] + for core in cores: + columns.append('Back%d' % core) + core_names.append('C%02d' % core) + times = plot.get_column(data_file, 'Time') + xmax = max(times) + ymax = plot.max_value(data_file, columns) + + ax = plot.start_plot(xmax, ymax, x_label='Time', + y_label='KB In Flight For %s Cores' % (plot.node_name(data_file))) + for i in range(len(columns)): + ax.plot(times, plot.get_column(data_file, columns[i]), + label=core_names[i], linewidth=0.8) + ax.legend(loc='upper right', prop={'size': 9}) + plt.tight_layout() + plt.savefig(plot_file) + + +def colors(plot_file): + """ + Generates a plot displaying standard colors. + + plot_file: Name of the file in which to output a plot. + """ + + plot.plot_colors(plot_file) + +# Parse command-line options. +parser = OptionParser(description= + 'Reads data output by tthoma.py and generates a plot. func is ' + 'the name of a function in this file, which will be invoked to ' + 'generate a particular plot; args provide additional information to ' + 'func if needed. 
Read the in-code documentation for the functions ' + 'for details on what kinds of plots are available.', + usage='%prog [options] func arg arg ...', + conflict_handler='resolve') +parser.add_option('--cores', dest='cores', default=None, + metavar='CORES', help='space-separated list of integer core numbers; ' + 'plots will include data from these cores only, where appropriate') +(options, args) = parser.parse_args() + +if options.cores != None: + options.cores = list(map(int, options.cores.split(" "))) + +if len(args) < 1: + print('No func was specified') + parser.print_help() + exit(1) + +if not args[0] in locals(): + print('There is no function %s' % (args[0])) + parser.print_help() + exit(1) + +locals()[args[0]](*args[1:]) \ No newline at end of file diff --git a/util/receive_raw.c b/util/receive_raw.c index 0e79978f..fb37eb4f 100644 --- a/util/receive_raw.c +++ b/util/receive_raw.c @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program that uses a raw socket to receive packets diff --git a/util/rpcid.py b/util/rpcid.py index ac0cd4ad..62a7e949 100755 --- a/util/rpcid.py +++ b/util/rpcid.py @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c)2023 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c)2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Analyzes Homa timetraces on two different machines to extract a diff --git a/util/send_raw.c b/util/send_raw.c index 76129125..f6a25fcf 100644 --- a/util/send_raw.c +++ b/util/send_raw.c @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program that will send a packet to a given @@ -29,6 +18,7 @@ #include #include "homa.h" +#include "test_utils.h" int main(int argc, char** argv) { int fd, status; @@ -37,7 +27,7 @@ int main(int argc, char** argv) { char *message; char *host; int protocol; - sockaddr_in_union *addr; + union sockaddr_in_union *addr; uint8_t *bytes; if (argc < 3) { @@ -66,7 +56,7 @@ int main(int argc, char** argv) { host, gai_strerror(status)); exit(1); } - addr = (sockaddr_in_union*) result->ai_addr; + addr = (union sockaddr_in_union*) result->ai_addr; bytes = (uint8_t *) &addr->in4.sin_addr; printf("Destination address: %x (%d.%d.%d.%d)\n", addr->in4.sin_addr.s_addr, bytes[0], bytes[1], bytes[2], bytes[3]); diff --git a/util/server.cc b/util/server.cc index 3ff09280..a87d753d 100644 --- a/util/server.cc +++ b/util/server.cc @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program that acts as a server for testing either @@ -40,7 +29,9 @@ #include +extern "C" { #include "homa.h" +} #include "test_utils.h" /* Log events to standard output. */ @@ -66,16 +57,18 @@ int inet_family = AF_INET; */ void homa_server(int port) { - int fd; - sockaddr_in_union addr; - sockaddr_in_union source; - int length; + struct homa_sendmsg_args reply_args; + struct iovec vecs[HOMA_MAX_BPAGES]; struct homa_recvmsg_args recv_args; + struct homa_rcvbuf_args arg; + struct msghdr reply_msghdr; + sockaddr_in_union source; + sockaddr_in_union addr; struct msghdr hdr; - struct homa_set_buf_args arg; char *buf_region; - struct iovec vecs[HOMA_MAX_BPAGES]; int num_vecs; + int length; + int fd; fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); if (fd < 0) { @@ -87,7 +80,7 @@ void homa_server(int port) addr.in4.sin_port = htons(port); if (bind(fd, &addr.sa, sizeof(addr)) != 0) { printf("Couldn't bind socket to Homa port %d: %s\n", port, - strerror(errno)); + strerror(errno)); return; } if (verbose) @@ -95,17 +88,18 @@ void homa_server(int port) // Set up buffer region. 
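+	// (Note: the code below uses Homa's receive-buffer mechanism: a
+	// region of memory is mmapped and then registered with the socket
+	// via setsockopt(SO_HOMA_RCVBUF), so that the kernel can place
+	// incoming message data directly in the region.)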
buf_region = (char *) mmap(NULL, 1000*HOMA_BPAGE_SIZE, - PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); + PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (buf_region == MAP_FAILED) { printf("Couldn't mmap buffer region: %s\n", strerror(errno)); return; } - arg.start = buf_region; + arg.start = (uintptr_t)buf_region; arg.length = 1000*HOMA_BPAGE_SIZE; - int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, - sizeof(arg)); + int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, + sizeof(arg)); if (status < 0) { - printf("Error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("Error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); return; } @@ -122,28 +116,33 @@ void homa_server(int port) int seed; int result; + /* Note: by reusing recv_args for successive calls to + * recvmsg we automatically return to Homa the buffers + * left in recv_args by the previous call to recvmsg. + */ recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_REQUEST; + hdr.msg_controllen = sizeof(recv_args); length = recvmsg(fd, &hdr, 0); if (length < 0) { printf("recvmsg failed: %s\n", strerror(errno)); continue; } - int resp_length = ((int *) (buf_region + recv_args.bpage_offsets[0]))[1]; + int resp_length = ((int *) (buf_region + + recv_args.bpage_offsets[0]))[1]; if (validate) { seed = check_message(&recv_args, buf_region, length, - 2*sizeof32(int)); + 2*sizeof32(int)); if (verbose) printf("Received message from %s with %d bytes, " - "id %lu, seed %d, response length %d\n", - print_address(&source), length, - recv_args.id, seed, resp_length); + "id %llu, seed %d, response length %d\n", + print_address(&source), length, + recv_args.id, seed, resp_length); } else if (verbose) printf("Received message from %s with " - "%d bytes, id %lu, response length %d\n", - print_address(&source), length, - recv_args.id, resp_length); + "%d bytes, id %llu, response length %d\n", + print_address(&source), length, + recv_args.id, resp_length); /* Second word of the message indicates how large a * response to send. @@ -152,14 +151,18 @@ void homa_server(int port) while (resp_length > 0) { vecs[num_vecs].iov_len = (resp_length > HOMA_BPAGE_SIZE) ? HOMA_BPAGE_SIZE : resp_length; - vecs[num_vecs].iov_base = buf_region - + recv_args.bpage_offsets[num_vecs]; + vecs[num_vecs].iov_base = buf_region + + recv_args.bpage_offsets[num_vecs]; resp_length -= vecs[num_vecs].iov_len; num_vecs++; } - result = homa_replyv(fd, vecs, num_vecs, &source, recv_args.id); + init_sendmsg_hdrs(&reply_msghdr, &reply_args, vecs, num_vecs, + &source.sa, sockaddr_size(&source.sa)); + reply_args.id = recv_args.id; + result = sendmsg(fd, &reply_msghdr, 0); if (result < 0) { - printf("homa_reply failed: %s\n", strerror(errno)); + printf("sendmsg for reply failed: %s\n", + strerror(errno)); } } } diff --git a/util/service.py b/util/service.py index 55c10796..0fd54cbd 100755 --- a/util/service.py +++ b/util/service.py @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2019-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2019-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Compute service times for RPCs from a server-side trace. diff --git a/util/smi.cc b/util/smi.cc index 69b0a8f9..4770f417 100644 --- a/util/smi.cc +++ b/util/smi.cc @@ -1,16 +1,5 @@ -/* Copyright (c) 2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This program spawns a collection of threads on different cores to diff --git a/util/strip.py b/util/strip.py new file mode 100755 index 00000000..b47eacbc --- /dev/null +++ b/util/strip.py @@ -0,0 +1,416 @@ +#!/usr/bin/python3 + +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +""" +This script is used to copy information from the Homa GitHub repo to +a Linux kernel repo, removing information that doesn't belong in the +official kernel version (such as calls to tt_record). + +Usage: strip.py file file file ... destdir + +Each of the files will be read, stripped as appropriate, and copied to a +file by the same name in destdir. If there is only a single file and no +destdir, then the stripped file is printed on standard output. + +The following code is removed automatically: + * Calls to timetracing, such as tt_record* + * Blocks conditionalized on '#ifdef __UNIT_TEST__' + * UNIT_LOG and UNIT_HOOK statements + * INC_METRIC statements + * IF_NO_STRIP statements + +Additional stripping is controlled by #ifdefs. The #ifdefs allow the +code to be used in three ways: +* Normal compilation in a development environment: includes unit testing + and timetracing support, nothing is stripped. The code is compiled as + is. +* Upstreaming: source files are run through this program, which produces + a statically-stripped version. +* Compile-time stripping: the code is compiled as is, but "__STRIP__=y" is + set on the make command line (both for compiling Homa and for unit testing). + This omits almost all of the information that must be omitted for + upstreaming, but retains a few debugging facilities like timetracing. + +Here are details about the #ifdefs used for stripping: + +* This entire block will be removed in the stripped version, but it will + be compiled in normal mode: + #ifndef __STRIP__ /* See strip.py */ + ... + #endif /* See strip.py */ + +* This entire block will be removed in the stripped version, but it will + be compiled in both normal mode and with compile-time stripping. 
+  #ifndef __UPSTREAM__ /* See strip.py */
+  ...
+  #endif /* See strip.py */
+
+* The #if and #endif statements will be removed, leaving just the code
+  in between. The code will be compiled in compile-time stripping mode.
+  #ifdef __STRIP__ /* See strip.py */
+  ...
+  #endif /* See strip.py */
+
+* Everything will be removed except the code between #else and #endif.
+  During normal mode the #ifndef block will be compiled; under compile-time
+  stripping the #else block will be compiled.
+  #ifndef __STRIP__ /* See strip.py */
+  ...
+  #else /* See strip.py */
+  ...
+  #endif /* See strip.py */
+"""
+
+from collections import defaultdict
+from glob import glob
+from optparse import OptionParser
+import math
+import os
+from pathlib import Path
+import re
+import string
+import sys
+
+exit_code = 0
+
+def remove_close(line):
+    """
+    Given a line of text containing a '}', remove the '}' and any
+    following white space. If there is no '}', returns the original line.
+    """
+    i = line.rfind('}')
+    if i < 0:
+        return line
+    j = len(line)
+    for j in range(i+1, len(line), 1):
+        if line[j] != ' ':
+            break
+    return line[0:i] + line[j:]
+
+def remove_open(line):
+    """
+    Given a line of text containing a '{', remove the '{' and any
+    preceding white space. If there is no '{', returns the original line.
+    """
+    i = line.rfind('{')
+    if i < 0:
+        return line
+    j = -1
+    for j in range(i-1, -1, -1):
+        if line[j] != ' ':
+            break
+    return line[0:j+1] + line[i+1:]
+
+def leading_space(line):
+    """
+    Return the number of characters of leading space in a line (a tab counts
+    as 8 spaces).
+    """
+
+    count = 0
+    for c in line:
+        if c == ' ':
+            count += 1
+        elif c == '\t':
+            count += 8
+        else:
+            break
+    return count
+
+def last_non_blank(s):
+    """
+    Return the last non-blank character in s, or None if there is no
+    non-blank character in s.
+    """
+    s2 = s.rstrip()
+    if s2:
+        return s2[-1]
+    return None
+
+def blank_next_ok(line):
+    """
+    Given a line, return True if it is OK for this line to be followed by
+    a blank line. False means that if the next line to be output is blank,
+    it should be dropped.
+    """
+    s = line.strip()
+    if s == '':
+        return False
+    if s.endswith('{') or s.endswith('*/'):
+        return False
+    return True
+
+def scan(file):
+    """
+    Read a file, remove information that shouldn't appear in the Linux kernel
+    version, and return an array of lines representing the stripped file.
+    file:   Pathname of file to read
+    """
+
+    global exit_code
+
+    # True means the current line ends in the middle of a /* ... */ comment
+    in_comment = False
+
+    # True means we're in the middle of a multi-line statement that
+    # should be skipped (drop until a semicolon is seen).
+    skip_statement = False
+
+    # Values of 0 or 1 mean we're in the middle of a group of lines labeled
+    # with '#ifndef __STRIP__' or '#ifdef __STRIP__'. 0 means we're including
+    # lines, 1 means we're stripping them. None means we're not in such a
+    # group.
+    in_labeled_skip = None
+
+    # Used to strip out unit testing code. Value is one of:
+    # None:     We're not in the middle of an '#ifdef __UNIT_TEST__'
+    # 'if':     An '#ifdef __UNIT_TEST__' has been seen, but the
+    #           corresponding #else or #endif has not been seen yet
+    # 'else':   We are in the middle of an '#else' clause for an
+    #           '#ifdef __UNIT_TEST__'
+    in_unit = None
+
+    # Used to strip out conditional code based on version
+    # None:     We're not in the middle of an '#if LINUX_VERSION_CODE'
+    # 'if':     An '#if LINUX_VERSION_CODE' has been seen, but not the
+    #           corresponding #else or #endif (code should be stripped)
+    # 'else':   We are in the middle of an '#else' clause for an
+    #           '#if LINUX_VERSION_CODE' (this code should remain)
+    in_version = None
+
+    # Array of lines containing the stripped version of the file
+    slines = []
+
+    # Index in slines of the most recent line ending with a '{', or None
+    # if none. Only valid for innermost blocks (those with no nested blocks).
+    open_index = None
+
+    # Number of statements that have been seen since the last '{': used to
+    # eliminate curly braces around blocks that end up with only a single
+    # statement. Set to a number > 1 if there isn't an "interesting"
+    # current block.
+    statements_in_block = 100
+
+    # True means lines were automatically deleted in the current block;
+    # at the end of the block, see if curly braces are no longer needed.
+    check_braces = False
+
+    # Used when deleted statements like tt_record are surrounded on both
+    # sides by empty lines; the second empty line will be deleted.
+    delete_empty_line = False
+
+    line_num = 0
+
+    f = open(file)
+    for line in f:
+        line_num += 1
+
+        # pline is used for parsing; it is modified to remove
+        # uninteresting information such as whitespace.
+        pline = line.strip()
+
+        if pline.startswith('//') and 'SPDX-License' not in pline:
+            # Strip // comment lines: these are used only for commenting
+            # out debugging code.
+            continue
+
+        # Extract the part of the line that is *not* in a /*...*/ comment
+        # (assume at most one comment per line).
+        cstart = pline.find('/*')
+        cend = pline.find('*/')
+        if cstart >= 0:
+            if cend >= 0:
+                non_comment = pline[0:cstart] + pline[cend+2:]
+                in_comment = False
+            else:
+                non_comment = pline[0:cstart]
+                in_comment = True
+        elif cend >= 0:
+            non_comment = pline[cend+2:]
+            in_comment = False
+        elif in_comment:
+            non_comment = ''
+        else:
+            non_comment = pline
+        non_comment = non_comment.strip()
+
+        # Strip groups of lines labeled with '#ifndef __STRIP__' or
+        # '#ifndef __UPSTREAM__'. Note: don't do brace elimination here:
+        # this gives greater control to the __STRIP__ code.
+        if in_labeled_skip is not None:
+            if line.startswith('#endif /* See strip.py */'):
+                in_labeled_skip = None
+                continue
+            elif line.startswith('#else /* See strip.py */'):
+                in_labeled_skip = 0
+                continue
+            if in_labeled_skip == 1:
+                continue
+        if line.startswith('#ifndef __STRIP__ /* See strip.py */') or (
+                line.startswith('#ifndef __UPSTREAM__ /* See strip.py */')):
+            if not blank_next_ok(slines[-1]):
+                delete_empty_line = True
+            in_labeled_skip = 1
+            check_braces = True
+            continue
+        if line.startswith('#ifdef __STRIP__ /* See strip.py */'):
+            if not blank_next_ok(slines[-1]):
+                slines.pop()
+            in_labeled_skip = 0
+            check_braces = True
+            continue
+
+        # Strip tt_freeze() statements.
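+        # (tt_freeze, like the tt_record calls handled below, is part of
+        # Homa's timetrace debugging machinery and must not appear in the
+        # upstreamed sources.)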
+ if pline == 'tt_freeze();': + check_braces = True + if not blank_next_ok(slines[-1]): + delete_empty_line = True + continue + + if skip_statement: + if pline[-1] == ';': + skip_statement = False + check_braces = True + continue + + # Strip tt_record, INC_METRIC, and IF_NO_STRIP statements. + match = re.match('(//[ \t]*)?tt_record[1-4]?[(]', pline) + if not match: + match = re.match('(//[ \t]*)?INC_METRIC[(]', pline) + if not match: + match = re.match('(//[ \t]*)?IF_NO_STRIP[(]', pline) + if match: + # If this is the only statement in its block, delete the + # outer block statement (if, while, etc.). Don't delete case + # statements. + if not match.group(1): + indent = leading_space(line) + for i in range(len(slines)-1, -1, -1): + prev = slines[i] + prev_indent = leading_space(prev) + if last_non_blank(prev) == '{': + break + if prev_indent == 0: + # Label or method start; no need to continue further + break + if leading_space(prev) < indent: + if not prev.lstrip().startswith('case'): + slines = slines[:i] + break + + if pline[-1] != ';': + skip_statement = True + if not blank_next_ok(slines[-1]): + delete_empty_line = True + check_braces = True + continue + + # Strip UNIT_LOG and UNIT_HOOK statements. + if (pline.startswith('UNIT_LOG(') or pline.startswith('UNIT_HOOK(')): + if pline[-1] != ';': + skip_statement = True + if not blank_next_ok(slines[-1]): + delete_empty_line = True + check_braces = True + continue + + # Strip #include "homa_strip.h" statements. + if pline.startswith('#include "homa_strip.h"'): + if not blank_next_ok(slines[-1]): + delete_empty_line = True + continue + + # Strip '#ifdef __UNIT_TEST__' blocks (keep #else clauses) + if in_unit: + if line.startswith('#endif /* __UNIT_TEST__ */'): + in_unit = None + continue + if line.startswith('#else /* __UNIT_TEST__ */'): + in_unit = 'else' + continue + if in_unit == 'if': + continue + elif line.startswith('#ifdef __UNIT_TEST__'): + in_unit = 'if' + if not blank_next_ok(slines[-1]): + delete_empty_line = True + continue + elif line.startswith('#ifndef __UNIT_TEST__'): + in_unit = 'else' + if not blank_next_ok(slines[-1]): + delete_empty_line = True + continue + + # Strip 'if LINUX_VERSION_CODE' blocks (keep #else clauses) + if in_version: + if line.startswith('#endif'): + in_version = None + continue + if line.startswith('#else'): + in_version = 'else' + continue + if in_version == 'if': + continue + elif line.startswith('#if LINUX_VERSION_CODE'): + in_version = 'if' + if not blank_next_ok(slines[-1]): + delete_empty_line = True + continue + + if not pline: + if not line.isspace() or not delete_empty_line: + slines.append(line) + delete_empty_line = False + continue + delete_empty_line = False + + # Remove braces for blocks that now have only a single statement + if pline == '}' or pline.startswith('} else'): + if slines[-1].strip() == '': + slines.pop() + if check_braces: + check_braces = False + if open_index != None: + if statements_in_block == 0: + print('%s:%d: stripping creates empty block' % + (file, line_num), file=sys.stderr) + exit_code = 1 + if statements_in_block == 1: + slines[open_index] = remove_open(slines[open_index]) + line = remove_close(line) + if not line.strip(): + open_index = None + continue + open_index = None + if pline[-1] == '{' and line[0] != '{': + statements_in_block = 0 + open_index = len(slines) + check_braces = False + + # Count statements + if non_comment and non_comment[-1] == ';': + statements_in_block += 1 + + # The current line needs to be retained in the output. 
+            slines.append(line)
+    f.close()
+    return slines
+
+if __name__ == '__main__':
+    f = sys.stdin
+    if len(sys.argv) < 2:
+        print('Usage: strip.py file [file ... destdir]', file=sys.stderr)
+        exit(1)
+    if len(sys.argv) == 2:
+        for line in scan(sys.argv[1]):
+            print(line, end='')
+    else:
+        for file in sys.argv[1:-1]:
+            dst_file = '%s/%s' % (sys.argv[-1], file)
+            print('Stripping %s into %s' % (file, dst_file))
+            slines = scan(file)
+            dst = open(dst_file, 'w')
+            for line in slines:
+                print(line, end='', file=dst)
+            dst.close()
+    sys.exit(exit_code)
\ No newline at end of file
diff --git a/util/strip_decl.py b/util/strip_decl.py
new file mode 100755
index 00000000..65c23d07
--- /dev/null
+++ b/util/strip_decl.py
@@ -0,0 +1,170 @@
+#!/usr/bin/python3
+
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+"""
+This script is used to make a copy of homa_impl.h that selectively omits
+certain function definitions, depending on which patch in a patch series
+is being generated.
+
+Usage: strip_decl.py src dst patch
+
+Src gives the file to read, dst names the file to (over)write, and patch
+identifies the specific patch that is being generated (it must be one
+of the initial values in a sublist of symbols below).
+"""
+
+from collections import defaultdict
+from glob import glob
+import math
+import os
+import re
+import string
+import sys
+
+# Each list element is a list containing a patch name followed by any number
+# of line prefixes. The lists are in patch order: a line will be excluded
+# from the output file if it starts with one of the prefixes for a patch
+# *after* the one specified on the command line. The "none" patch includes
+# no symbols, "all" includes all symbols.
+symbols = [
+    ['none'],
+    ['peer',
+        'int homa_xmit_control('
+    ],
+    ['rpc',
+        'int homa_message_in_init(',
+        'void homa_rpc_handoff(',
+        'void homa_xmit_data('
+    ],
+    ['outgoing',
+        'int homa_fill_data_interleaved(',
+        'int homa_message_out_fill(',
+        'void homa_message_out_init(',
+        'void homa_resend_data(',
+        'int homa_rpc_tx_end(',
+        'struct sk_buff *homa_tx_data_pkt_alloc(',
+        'int __homa_xmit_control(',
+        'void __homa_xmit_data(',
+        'void homa_xmit_unknown('
+    ],
+    ['utils',
+        'void homa_destroy(',
+        'int homa_init(',
+        'void homa_net_destroy(',
+        'int homa_net_init(',
+        'void homa_spin('
+    ],
+    ['incoming',
+        'void homa_ack_pkt(',
+        'void homa_add_packet(',
+        'int homa_copy_to_user(',
+        'void homa_data_pkt(',
+        'void homa_dispatch_pkts(',
+        'struct homa_gap *homa_gap_alloc(',
+        'void homa_need_ack_pkt(',
+        'void homa_request_retrans(',
+        'void homa_resend_pkt(',
+        'void homa_rpc_unknown_pkt(',
+        'int homa_wait_private(',
+        'struct homa_rpc *homa_wait_shared('
+    ],
+    ['timer',
+        'void homa_timer(',
+        'void homa_timer_check_rpc(',
+        'int homa_timer_main('
+    ],
+    ['plumbing',
+        'int homa_bind(',
+        'void homa_close(',
+        'int homa_err_handler_v4(',
+        'int homa_err_handler_v6(',
+        'int homa_getsockopt(',
+        'int homa_hash(',
+        'enum hrtimer_restart homa_hrtimer(',
+        'int homa_ioctl(',
+        'int homa_load(',
+        'void homa_net_exit(',
+        'int homa_net_start(',
+        '__poll_t homa_poll(',
+        'int homa_recvmsg(',
+        'int homa_sendmsg(',
+        'int homa_setsockopt(',
+        'int homa_shutdown(',
+        'int homa_socket(',
+        'int homa_softirq(',
+        'void homa_unhash(',
+        'void homa_unload('
+    ],
+    ['all']
+]
+
+# A list of all of the line prefixes that have not yet been encountered
+# in the source file. Used to print error messages at the end for any
+# that don't appear anywhere in the file.
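+#
+# (If any prefix is reported as unseen at the end of a run, the symbols
+# table above has probably fallen out of date with the source file.)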
+unseen = [] + +for patch in symbols: + for prefix in patch[1:]: + unseen.append(prefix) + +if len(sys.argv) != 4: + print('Usage: strip_decl.py src dst patch') + exit(1) + +src = open(sys.argv[1]) +dst = open(sys.argv[2], 'w') +patch_name = sys.argv[3] +found_patch = False +for patch in symbols: + if patch[0] == patch_name: + found_patch = True + break +if not found_patch: + print('Unknown patch name "%s"' % (patch_name), file=sys.stderr) + exit(1) +skipping_to_semi = False +prev_line_empty = False +for line in src: + if skipping_to_semi: + if line.endswith(';\n'): + skipping_to_semi = False + continue + + for prefix in unseen: + if line.startswith(prefix): + unseen.remove(prefix) + break; + + found_patch = False + omit = False + for patch in symbols: + if found_patch: + for prefix in patch[1:]: + if line.startswith(prefix): + omit = True + break + if omit: + break + if patch_name == patch[0]: + found_patch = True + if omit: + if not line.endswith(';\n'): + skipping_to_semi = True + else: + if line == '\n': + prev_line_empty = True + else: + if prev_line_empty: + print('', file=dst,) + print(line, file=dst, end='') + prev_line_empty = False + +if unseen: + print('The following prefixes did not appear in %s:' % (sys.argv[1]), + file=sys.stderr) + for prefix in unseen: + print(prefix, file=sys.stderr) + +dst.close() +src.close() \ No newline at end of file diff --git a/util/test_time_trace.c b/util/test_time_trace.c deleted file mode 100644 index 2c4d6a83..00000000 --- a/util/test_time_trace.c +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* This program exercises the Linux kernel time trace mechanism - * by calling a new system call that creates time traces. - */ - -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char** argv) { - int i; - printf("Invoking new 'test_timetrace' syscall.\n"); - for (i = 0; i < 100; i++) { - int status = syscall(334); - if (status < 0) { - printf(" Error in test_timetrace: %s (%d)", - strerror(errno), errno); - } - } - return 0; -} - diff --git a/util/test_utils.cc b/util/test_utils.cc index 1a7abe7d..81d3b561 100644 --- a/util/test_utils.cc +++ b/util/test_utils.cc @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2023 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/* Copyright (c) 2019-2023 Homa Developers
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
 */
 
 /* This file contains a collection of functions that are useful in
@@ -228,8 +217,6 @@ void seed_buffer(void *buffer, size_t length, int seed)
 /**
  * print_address() - Generate a human-readable description of an inet address.
  * @addr: The address to print
- * @buffer: Where to store the human readable description.
- * @size: Number of bytes available in buffer.
  *
  * Return: The address of the human-readable string (buffer).
  *
@@ -237,7 +224,7 @@ void seed_buffer(void *buffer, size_t length, int seed)
  * strings, so callers don't have to worry about allocating space, even if
  * several addresses are in use at once. This function is also thread-safe.
  */
-const char *print_address(const sockaddr_in_union *addr)
+const char *print_address(const union sockaddr_in_union *addr)
 {
 	// Avoid cache line conflicts:
@@ -282,3 +269,28 @@ double to_seconds(uint64_t cycles)
 {
 	return ((double) (cycles))/get_cycles_per_sec();
 }
+
+/**
+ * split() - Splits a string into substrings separated by a given character.
+ * @s:      String to split
+ * @sep:    Separator character
+ * @dest:   Substrings are appended to this vector (if @sep doesn't
+ *          appear in @s, then @s is appended).
+ */
+void split(const char *s, char sep, std::vector<std::string> &dest)
+{
+	while (1) {
+		const char *end;
+		while (*s == sep)
+			s++;
+		if (*s == 0)
+			break;
+		end = strchr(s, sep);
+		if (end == NULL) {
+			dest.emplace_back(s);
+			break;
+		}
+		dest.emplace_back(s, end-s);
+		s = end;
+	}
+}
\ No newline at end of file
diff --git a/util/test_utils.h b/util/test_utils.h
index d583d598..aeef2c1d 100644
--- a/util/test_utils.h
+++ b/util/test_utils.h
@@ -1,16 +1,5 @@
-/* Copyright (c) 2019-2022 Stanford University
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/* Copyright (c) 2019-2022 Homa Developers
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
 */
 
 #ifndef _TEST_UTILS_H
@@ -18,6 +7,11 @@
 
 #include
 
+#ifdef __cplusplus
+#include
+#include
+#endif
+
 #include "homa.h"
 
 #ifdef __cplusplus
@@ -25,6 +19,26 @@
 extern "C" {
 #endif
 
+/**
+ * Holds either an IPv4 or IPv6 address (smaller and easier to use than
+ * sockaddr_storage).
+ */
+union sockaddr_in_union {
+	struct sockaddr sa;
+	struct sockaddr_in in4;
+	struct sockaddr_in6 in6;
+};
+
+/**
+ * sockaddr_size() - Return the number of bytes used by the argument.
+ * @sa: Pointer to either an IPv4 or an IPv6 address.
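+ *
+ * Return: sizeof(struct sockaddr_in) for an IPv4 address, otherwise
+ * sizeof(struct sockaddr_in6).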
+ */ +static inline uint32_t sockaddr_size(const struct sockaddr *sa) +{ + return (sa->sa_family == AF_INET) ? sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6); +} + #define sizeof32(type) static_cast(sizeof(type)) extern int check_buffer(void *buffer, size_t length); @@ -33,10 +47,13 @@ extern int check_message(struct homa_recvmsg_args *control, extern double get_cycles_per_sec(); extern int get_int(const char *s, const char *msg); extern void pin_thread(int core); +extern const char* + print_address(const union sockaddr_in_union *addr); extern void print_dist(uint64_t times[], int count); extern void seed_buffer(void *buffer, size_t length, int seed); -extern const char* - print_address(const sockaddr_in_union *addr); +#ifdef __cplusplus +extern void split(const char *s, char sep, std::vector &dest); +#endif extern double to_seconds(uint64_t cycles); /** @@ -50,6 +67,37 @@ inline static uint64_t rdtsc(void) return (((uint64_t)hi << 32) | lo); } +/** + * init_sendmsg_hdrs(): Convenience function to initialize the two headers + * needed to invoke sendmsg for Homa. This initializes for the common case; + * callers may need to set some fields explicitly for less common cases. + * @hdr: msghdr argument to sendmsg: will be initialized here. + * @args: Homa's sendmsg arguments; will be initialized here. + * @iov: Describes outgoing message. + * @iovcnt: Number of entries in @iov. + * @dest_addr: Target for the message. + * @addrlen: Size of @dest_addr (bytes). + */ +inline static void init_sendmsg_hdrs(struct msghdr *hdr, + struct homa_sendmsg_args *args, + struct iovec *iov, int iovcnt, + const struct sockaddr *dest_addr, + __u32 addrlen) +{ + args->id = 0; + args->completion_cookie = 0; + args->flags = 0; + args->reserved = 0; + + hdr->msg_name = (struct sockaddr *)dest_addr; + hdr->msg_namelen = addrlen; + hdr->msg_iov = iov; + hdr->msg_iovlen = iovcnt; + hdr->msg_control = args; + hdr->msg_controllen = 0; + hdr->msg_flags = 0; +} + #ifdef __cplusplus } #endif diff --git a/util/time_trace.cc b/util/time_trace.cc index dc7beb22..9d815cff 100644 --- a/util/time_trace.cc +++ b/util/time_trace.cc @@ -1,16 +1,5 @@ -/* Copyright (c) 2014-2022 Stanford University - * - * Permission to use, copy, modify, and distribute this software for any purpose - * with or without fee is hereby granted, provided that the above copyright - * notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER - * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF - * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2014-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ #include diff --git a/util/time_trace.h b/util/time_trace.h index 7f2c3f4f..cb06236d 100644 --- a/util/time_trace.h +++ b/util/time_trace.h @@ -1,16 +1,5 @@ -/* Copyright (c) 2020-2022 Stanford University - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2020-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ #ifndef TIMETRACE_H diff --git a/util/ttcore.py b/util/ttcore.py deleted file mode 100755 index 05264a89..00000000 --- a/util/ttcore.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/python3 - -# Copyright (c) 2019-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -""" -Scan the timetrace data in a log file; for records containing certain -substrings, compute statistics for how often those records occur on each -core. -Usage: ttcore.py [substring substring ...] [file] -Each substring argument selects a collection of entries in the timetrace; -each collection will be analyzed separately for core usage. If no substrings -are specified, a default collection will be used. File gives the name of the -timetrace file to use (stdin is used if no file is specified). -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys - -# The substrate arguments that we are matching against timetrace entries -substrings = [] - -# For each entry in substrings there is an entry in this array, which -# consists of an array of counts (how many times a timetrace entry matching -# the substring occurred on each core). -cores = [] - -# Highest core number seen -max_core = 0 - -def scan(f): - """ - Scan the log file given by 'f' and accumulate core statistics. 
- """ - - global substrings, cores, max_core - startTime = 0.0 - prevTime = 0.0 - writes = 0 - for line in f: - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\] (.*)', - line) - if not match: - print("Line didn't match: %s" % (line)) - continue - time = float(match.group(1)) - core = int(match.group(2)) - if core > max_core: - max_core = core - event = match.group(3) - for i in range(0, len(substrings)): - if substrings[i] in event: - c = cores[i] - while len(c) <= core: - c.append(0) - c[core] += 1 - -f = sys.stdin -substrings = [] -if len(sys.argv) > 1: - try: - f = open(sys.argv[-1]) - substrings = sys.argv[1:-1] - except: - substrings = sys.argv[1:] - -if len(substrings) == 0: - substrings = ["mlx processed", - "homa_softirq: first", - "homa_recvmsg returning", - "homa_sendmsg request", - "mlx_xmit starting, id", - "pacer calling", - "tcp_v4_rcv invoked", - "tcp_recvmsg returning" - ] - -for i in range(0, len(substrings)): - cores.append([]) - -scan(f) - -max_length = 0 -for i in range(0, len(substrings)): - length = len(substrings[i]) - if length > max_length: - max_length = length - while len(cores[i]) <= max_core: - cores[i].append(0) - -line = "Event Substring Core 0" -for i in range (1, len(cores[0])): - line += " %5d" % (i) -print(line) -for i in range(0, len(substrings)): - line = "%-*s " % (max_length+1, substrings[i] + ":") - for count in cores[i]: - line += " %5d" % (count) - print(line) \ No newline at end of file diff --git a/util/ttgrants.py b/util/ttgrants.py deleted file mode 100755 index dad12286..00000000 --- a/util/ttgrants.py +++ /dev/null @@ -1,325 +0,0 @@ -#!/usr/bin/python3 - -""" -Scans a timetrace to compute grant lag: how long it takes after a -grant is issued for the granted packet to arrive. Also computes -statistics on when grants arrive compared to when they are needed -to transmit at full bandwidth. -Usage: ttgrant.py [tt_file] - -The existing timetrace is in tt_file (or stdin in tt_file is omitted). -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys -from statistics import median - -def percentile(list, pct, format, na): - """ - Finds the element of list corresponding to a given percentile pct - (0 is first, 100 or more is last), formats it according to format, - and returns the result. Returns na if the list is empty. - """ - if len(list) == 0: - return na - i = int(pct*len(list)/100) - if i >= len(list): - i = len(list) - 1 - return format % (list[i]) - -verbose = False -if (len(sys.argv) >= 2) and (sys.argv[1] == "--verbose"): - verbose = True - sys.argv.pop(1) -if len(sys.argv) == 2: - f = open(sys.argv[1]) -elif len(sys.argv) == 1: - f = sys.stdin -else: - print("Usage: %s [--verbose] [tt_file]" % (sys.argv[0])) - sys.exit(1) - -# Network link speed in Gbps. -gbps = 40 - -# Collects all the observed grant latencies (time from sending grant -# to receiving first data packet enabled by grant), in microseconds -latencies = [] - -# Keys are RPC ids. Each value is a list of lists, one per outstanding -# grant, where each sublist consists of a pair -# griple identifying one grant -out_grants = {} - -# Keys are RPC ids, values are the highest offset seen in any grant -# for the RPC (including the initial "grant" for unscheduled data). -last_grant = {} - -# Largest observed incoming packet size (presumably a full GSO packet?). 
-packet_size = 0 - -# Keys are outgoing RPC ids; each value is the amount of unscheduled data -# transmitted for that RPC. -unscheduled = {} - -# Keys are RPC ids; each value is a list of lists, one per grant received -# for that RPC, and each entry is an triple pair indicating -# when the grant was received and the range of bytes it covers. -in_grants = {} - -# Keys are RPC ids; each value is a list of lists, one per data packet -# sent for that RPC, and each entry is an pair describing -# that data packet. -out_data = {} - -# Keys are RPC ids; each value is the first time at which we noticed that -# this RPC is transmitting data. -first_out = {} - -for line in f: - # Collect info about outgoing grants (including implicit grants - # for unscheduled bytes) - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Incoming message for id ([0-9.]+) has ([0-9.]+) unscheduled', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = int(match.group(5)) - last_grant[id] = offset - out_grants[id] = [] - # print("%9.3f: unscheduled 'grant' for id %d, offset %d" % ( - # time, id, offset)) - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'sending grant for id ([0-9.]+), offset ([0-9.]+)', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = int(match.group(5)) - if id in last_grant: - # print("%9.3f: grant for id %d, %d:%d" % (time, id, - # last_grant[id], offset)) - out_grants[id].append([time, last_grant[id], offset]) - last_grant[id] = offset - - # Collect info about incoming data packets - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'incoming data packet, id ([0-9]+), .*, offset ([0-9.]+)', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = int(match.group(5)) - - # Update grant latencies - if not id in out_grants: - continue - grants = out_grants[id] - if grants: - grant = grants[0] - if grant[1] < offset: - if verbose: - print("%9.3f: grant lag %.1f us (%9.3f us), id %d, " - "range %d:%d" % (time, time - grant[0], grant[0], - id, grant[1], grant[2])) - latencies.append(time - grant[0]) - grants.pop(0) - - # Collect information about unscheduled data for outgoing RPCs - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'starting copy from user space .* id ([0-9]+), .* unscheduled ([0-9]+)', - line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - unsched = int(match.group(5)) - unscheduled[id] = unsched - first_out[id] = time - # print("%9.3f: %d unscheduled bytes for id %d" % (time, id, unsched)) - - # Collect info about incoming grants - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'processing grant for id ([0-9]+), offset ([0-9]+)', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = int(match.group(5)) - if not id in in_grants: - if not id in out_data: - # The trace doesn't include any outgoing data packets - continue - in_grants[id] = [] - if in_grants[id]: - start = in_grants[id][-1][2] - else: - if not id in unscheduled: - continue - start = unscheduled[id] - if start >= offset: - print("%9.3f: out of order grant for id %d: offset %d followed " - "by offset %d" % (time, id, start, offset)) - continue - in_grants[id].append([time, start, offset]) - # print("%9.3f: incoming grant for id %d, range %d:%d" % ( - # time, id, start, offset)) - - # Collect info about outgoing data packets (and also the packet 
size) - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Finished queueing packet: .* id ([0-9]+), offset ([0-9]+), ' - 'len ([0-9]+)', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = int(match.group(5)) - size = int(match.group(6)) - if size > packet_size: - packet_size = size - # print("Setting packet size to %d" % (packet_size)) - if not id in out_data: - if offset != 0: - # The trace doesn't include all outgoing data packets - continue - out_data[id] = [] - out_data[id].append([time, offset]) - if not (id in first_out): - first_out[id] = time - # print("%9.3f: outgoing data for id %d, offset %d" % ( - # time, id, offset)) - -# Get statistics about the time from first data packet to first -# incoming grant -first_grants = [] -for id in out_data: - if not ((id in in_grants) and in_grants[id]): - continue - delay = in_grants[id][0][0] - out_data[id][0][0] - first_grants.append(delay) - # print("Grant lag for id %d: %.3f us (ip_queue_xmit %.3f, " - # "grant received %.1f" % (id, delay, out_data[id][0][0], - # in_grants[id][0][0])) - -# Time to transmit a full-size packet, in microseconds. -xmit_time = (packet_size * 8)/(gbps * 1000) -print("Largest observed incoming packet: %d bytes" % (packet_size)) -print("Wire serialization time for %d-byte packet at %d Gbps: %.1f us" % ( - packet_size, gbps, xmit_time)) - -# Collect info for all incoming grants about how much additional data -# is authorized by each grant. -in_deltas = [] -for key in in_grants: - rpc_grants = in_grants[key] - for grant in rpc_grants: - in_deltas.append(grant[2] - grant[1]) - -# Compute lag in incoming grants (when the grant arrives relative to -# when we need it). For this, we only consider second and later grants -# for an RPC (assume the first one may be delayed by SRPT). -in_lags = [] -total_lag = 0 -for id in out_data: - if not id in in_grants: - continue - data = out_data[id] - grants = in_grants[id] - # For each grant, find the last data packet that could be sent - # without needing that grant - d = 0 - prev_data_time = 0 - for g in range(1, len(in_grants[id])): - grant = grants[g] - grant_start = grant[1] - time = grant[1] - if d >= len(data): - print("Ran out of data packets for id %d" % (id)) - break - while (data[d][1] < grant_start) and (d < (len(data)-1)): - prev_data_time = data[d][0] - d += 1 - if data[d][1] < grant_start: - break - lag = grant[0] - prev_data_time - xmit_time - in_lags.append(lag) - if (lag > 0): - total_lag += lag - # print("%9.3f: grant offset %d arrived for id %d, data time %9.3f" % ( - # grant[1], grant_start, id, prev_data_time)) - -# Compute total amount of time during which at least one RPC was actively -# transmitting. 
-xmit_active_time = 0 -start_times = [] -end_times = [] -for id in out_data: - start_times.append(first_out[id]) - end_times.append(out_data[id][-1][0]) -start_times = sorted(start_times) -end_times = sorted(end_times) -num_active = 0 -active_start = 0 -while (len(start_times) > 0) or (len(end_times) > 0): - if len(start_times) > 0: - if (len(end_times) == 0) or (start_times[0] < end_times[0]): - if num_active == 0: - active_start = start_times[0] - num_active += 1 - start_times.pop(0) - continue - num_active -= 1 - if num_active == 0: - xmit_active_time += end_times[0] - active_start - end_times.pop(0) - -latencies = sorted(latencies) -first_grants = sorted(first_grants) -in_lags = sorted(in_lags) -print("\nLatency: time from sending grant for an incoming message") -print(" (in homa_send_grants) to receiving first granted") -print(" data in Homa SoftIRQ") -print("First Lag: time from calling ip_queue_xmit for first data packet") -print(" until receiving first grant in Homa SoftIRQ") -print("In Lag: time when a grant arrived, relative to time when") -print(" it was needed to send message at full bandwidth") -print(" (skips first grant for each message)") -print("Pctile Latency First Lag In Lag") -for p in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99, 100]: - print("%3d %s %s %s" %(p, - percentile(latencies, p, "%6.1f us", " N/A "), - percentile(first_grants, p, "%6.1f us", " N/A "), - percentile(in_lags, p, "%6.1f us", " N/A "))) - -if len(latencies) == 0: - out_avg = " N/A " -else: - out_avg = "%6.1f us" % (sum(latencies)/len(latencies)) -if len(first_grants) == 0: - in_avg = " N/A " -else: - in_avg = "%6.1f us" % (sum(first_grants)/len(first_grants)) -if len(in_lags) == 0: - in_lags_avg = " N/A " -else: - in_lags_avg = "%6.1f us" % (sum(in_lags)/len(in_lags)) -print("Average: %9s %9s %9s" % (out_avg, in_avg, in_lags_avg)) - -if xmit_active_time != 0: - print("\nTotal data packet xmit delays because grants were slow:\n" - "%.1f us (%.1f%% of xmit active time)" % ( - total_lag, 100.0*total_lag/xmit_active_time)) - -in_deltas = sorted(in_deltas) -print("\nSizes of incoming grants (additional authorized data)") -print("Pctile Size") -for p in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99, 100]: - print("%3d %8s" %(p, percentile(in_deltas, p, "%d", "N/A"))) - -if len(in_deltas) == 0: - in_avg = "N/A" -else: - in_avg = "%.0f" % (sum(in_deltas)/len(in_deltas)) -print("Average %8s" % (in_avg)) diff --git a/util/ttgrep.py b/util/ttgrep.py index f55d479d..def0c833 100755 --- a/util/ttgrep.py +++ b/util/ttgrep.py @@ -1,25 +1,14 @@ #!/usr/bin/python3 -# Copyright (c) 2019-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+# Copyright (c) 2019-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ -Scan the time trace data in a log file; find all records containing -a given string, and output only those records. If the --rebase argument -is present, times are offset so the first event is at time 0. If the file -is omitted, standard input is used. -Usage: ttgrep.py [--rebase] string [file] +Scan the time trace data in a log file; find all records whose events +match a given Python regular expression, and output only those records. +If the --rebase argument is present, times are offset so the first event +is at time 0. If the file is omitted, standard input is used. +Usage: ttgrep.py [--rebase] regex [file] """ from __future__ import division, print_function @@ -33,24 +22,25 @@ rebase = False -def scan(f, string): +def scan(f, pattern): """ Scan the log file given by 'f' (handle for an open file) and output - all-time trace records containing string. + all-time trace records that match pattern. """ global rebase startTime = 0.0 prevTime = 0.0 writes = 0 + compiled = re.compile(pattern) for line in f: - match = re.match(' *([0-9.]+) us \(\+ *([0-9.]+) us\) (.*)', + match = re.match(' *([-0-9.]+) us \(\+ *([0-9.]+) us\) (.*)', line) if not match: continue time = float(match.group(1)) interval = float(match.group(2)) event = match.group(3) - if (string not in event) and ("Freez" not in event): + if (not compiled.search(event)) and ("Freez" not in event): continue if startTime == 0.0: startTime = time @@ -71,7 +61,7 @@ def scan(f, string): if len(sys.argv) == 3: f = open(sys.argv[2]) elif len(sys.argv) != 2: - print("Usage: %s [--rebase] string [logFile]" % (sys.argv[0])) + print("Usage: %s [--rebase] regex [logFile]" % (sys.argv[0])) sys.exit(1) scan(f, sys.argv[1]) \ No newline at end of file diff --git a/util/tthoma.py b/util/tthoma.py new file mode 100755 index 00000000..2995be2b --- /dev/null +++ b/util/tthoma.py @@ -0,0 +1,12910 @@ +#!/usr/bin/python3 + +# Copyright (c)2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ + +""" +This script analyzes time traces gathered from Homa in a variety of ways. +Invoke with the --help option for documentation. +""" + +from collections import defaultdict, deque +from functools import cmp_to_key +from glob import glob +import heapq +import itertools +from io import StringIO +import matplotlib +import matplotlib.pyplot as plt +from optparse import OptionParser +import math +from operator import itemgetter +import os +from pathlib import Path +import re +from socket import NI_NUMERICHOST +from statistics import stdev +import string +import sys +import textwrap +import time + +# This global variable holds information about every RPC from every trace +# file; it is created by AnalyzeRpcs. There is a separate entry for the +# client side and the server side of each RPC. Keys are RPC ids, values are +# dictionaries of info about that RPC, with the following elements (some +# elements may be missing if the RPC straddled the beginning or end of the +# timetrace): +# found: Last time when homa_wait_for_message found the RPC +# gro_core: Core that handled GRO processing for this RPC +# gro_data: List of tuples for all incoming +# data packets processed by GRO +# gro_data_pkts: List of packets processed by GRO for this RPC, sorted +# in order of 'gro' +# gro_grant: List of tuples for all incoming +# grant packets processed by GRO. 
Deprecated: use +# gro_grant_pkts instead +# gro_grant_pkts: List of all incoming grant packets processed by GRO +# handoff: Last time when RPC was handed off to waiting thread +# id: RPC's identifier +# in_length: Size of the incoming message, in bytes, or None if unknown +# ip_xmits: Dictionary mapping from offset to ip_*xmit time for +# that offset. Only contains entries for offsets where +# the ip_xmit record has been seen but not send_data +# node: 'node' field from the trace file where this RPC appeared +# (name of trace file without extension) +# out_length: Size of the outgoing message, in bytes +# peer: Address of the peer host +# queued: Last time when RPC was added to ready queue (no +# waiting threads). At most one of 'handoff' and 'queued' +# will be present. +# resend_rx: List of tuples for all incoming +# RESEND packets +# resend_tx: List of tuples for RESEND packets sent +# for the incoming message +# retransmits: One entry for each packet retransmitted; maps from offset +# to tuple +# softirq_data: List of tuples for all incoming +# data packets processed by SoftIRQ. Deprecated: used +# softirq_data_pkts instead +# softirq_data_pkts: List of all incoming data packets processed by SoftIRQ, +# sorted in order of 'softirq' +# softirq_grant: List of tuples for all incoming +# grant packets processed by SoftIRQ. Deprecated: use +# softirq_grant_pkts instead +# softirq_grant_pkts:List of all incoming grant packets processed by SoftIRQ +# recvmsg_done: Time when homa_recvmsg returned +# rx_live: Range of times [start, end] when the incoming message +# was in the process of being received. Starts when first +# homa_sendmsg is called on sender, ends when home_recvmsg +# returns. Missing if rx not live during trace. +# sendmsg: Time when homa_sendmsg was invoked +# send_data: List of tuples for outgoing +# data packets (length is message data); time is when +# packet was passed to ip*xmit. Deprecated: used +# send_data_pkts instead. +# send_data_pkts: List of outgoing data packets, sorted in order of +# 'xmit'. +# send_grant: List of tuples for outgoing +# grant packets. Deprecated: used send_grant_pkts instead +# send_grant_pkts: List of all outgoing grant packets +# tx_live: Range of times [start, end] when the outgoing message was +# partially transmitted. Starts when homa_sendmsg is called, +# ends when last data packet is transmitted by the NIC. +# Missing if tx isn't live during the trace. +# unsched: # of bytes of unscheduled data in the incoming message +# end: Time when the RPC was passed to homa_rpc_end +# +# The following fields will be present if homa_rpc_log_active_tt was invoked +# when the timetraces were frozen; they reflect the RPC's state at the end +# of the trace. 
+class RpcDict(dict):
+    def __missing__(self, id):
+        new_rpc = {'node': Dispatcher.cur_trace['node'],
+            'gro_data': [],
+            'gro_data_pkts': [],
+            'gro_grant': [],
+            'gro_grant_pkts': [],
+            'id': id,
+            'in_length': None,
+            'softirq_data': [],
+            'softirq_data_pkts': [],
+            'softirq_grant': [],
+            'softirq_grant_pkts': [],
+            'send_data': [],
+            'send_data_pkts': [],
+            'send_grant': [],
+            'send_grant_pkts': [],
+            'ip_xmits': {},
+            'resend_rx': [],
+            'resend_tx': [],
+            'retransmits': {}
+        }
+        self[id] = new_rpc
+        return new_rpc
+rpcs = RpcDict()
+
+# Largest amount of unscheduled data seen in any message; set by AnalyzeRpcs.
+max_unsched = 0
+
+# This global variable holds information about all of the traces that
+# have been read. Maps from the 'node' fields of a trace to a dictionary
+# containing the following values:
+# file:         Name of file from which the trace was read
+# line:         The most recent line read from the file
+# node:         The last element of file, with extension removed; used
+#               as a host name in various output
+# first_time:   Time of the first event read for this trace
+# last_time:    Time of the last event read for this trace
+# elapsed_time: Total time interval covered by the trace
+traces = {}
+
+# IP address -> node name. Computed by AnalyzeRpcs and AnalyzePackets.
+# A single node may be in the table twice, once using the full 4-byte
+# IP address, such as "0xa000105", which is available for Homa packets,
+# and once using only the low-order two bytes (e.g. "0x0105") which is
+# all that is available in TCP packets.
+ip_to_node = {}
+
+# This variable holds information about every data packet in the traces.
+# It is created by AnalyzePackets. Packets sent with TSO can turn into
+# multiple entries in this dictionary, one for each received packet. Keys
+# have the form id:offset where id is the RPC id on the sending side and
+# offset is the offset in message of the first byte of the packet. Each
+# value is a dictionary containing the following fields (some may not
+# be present, depending on which events were present in the traces):
+# type:         Packet type: always 'data'
+# xmit:         Time when ip*xmit was invoked
+# qdisc_defer:  If the packet was deferred by homa_qdisc, gives the
+#               time when the deferral decision was made. This field
+#               exists only for packets that were deferred.
+# qdisc_xmit:   Time when homa_qdisc requeued a packet that was deferred
+#               because of NIC queue length (only present for deferred
+#               packets)
+# xmit2:        qdisc_xmit if it exists, otherwise xmit: a time when Homa
+#               has decided to transmit the packet (after any Homa queuing).
+# nic:          Time when packet was handed off to the NIC (if available)
+# gro:          Time when GRO received the packet
+# softirq:      Time when homa_softirq processed the packet
+# copied:       Time when packet data was copied to user space on receiver
+# free:         Time when skb was freed on receiver
+# id:           RPC id on the sender
+# offset:       Offset of the data in the packet within its message
+# length:       # bytes of message data in the received packet
+# tso_length:   # bytes of message data in the sent packet. For TSO packets,
+#               which are divided into multiple segments, only the first
+#               segment will have this field, and it will give the TSO length
+#               (before segmentation). This may be an overestimate for the
+#               last packet of a message if the message size isn't known.
+# segments:     This field will be present in the first packet of each
+#               TSO packet (the one with tso_length set); it will be a
+#               list of all the other segments deriving from the same
+#               TSO packet.
+# msg_length:   Total number of bytes in the message, or None if unknown
+# pacer:        If this field exists it has the value True and it means that
+#               this is a TSO packet that was transmitted by the pacer
+# priority:     Priority at which packet was transmitted
+# tx_node:      Name of node from which the packet was transmitted or empty
+#               string if unknown (always valid if xmit is present)
+# tx_core:      Core on which ip*xmit was invoked
+# tx_qid:       NIC channel on which packet was transmitted
+# tx_queue:     Hex address of queue corresponding to tx_qid, if known
+# rx_node:      Name of node on which packet was received or empty string
+#               if unknown
+# gro_core:     Core on which homa_gro_receive was invoked
+# softirq_core: Core on which SoftIRQ processed the packet
+# free_tx_skb:  Time when NAPI released the skb on the sender, which can't
+#               happen until the packet has been fully transmitted.
+# retransmits:  A list with one entry for each time the packet was
+#               retransmitted. The entry is a dictionary with the same
+#               fields as a packet (though many may be omitted). There will
+#               be an entry "retrans" that gives the time of the trace
+#               record declaring retransmission. If there are no retransmits,
+#               this will be an empty list.
+class PacketDict(dict):
+    def __missing__(self, key):
+        id_str, offset_str = key.split(':')
+        self[key] = {'type': 'data', 'id': int(id_str),
+                'offset': int(offset_str), 'retransmits': [], 'segments': []}
+        return self[key]
+packets = PacketDict()
+
+# offset -> True for each offset that has occurred in a received data packet;
+# filled in by AnalyzePackets and AnalyzeRpcs.
+recv_offsets = {}
+
+# This variable holds information about every grant packet in the traces.
+# It is created by AnalyzePackets. Keys have the form id:offset where id is
+# the RPC id on the sending side and offset is the offset in message of
+# the first byte of the packet. Each value is a dictionary containing
+# the following fields:
+# type:         Packet type: always 'grant'
+# xmit:         Time when ip*xmit was invoked
+# nic:          Time when the packet was handed off to the NIC
+# gro:          Time when GRO received (the first bytes of) the packet
+# gro_core:     Core on which homa_gro_receive was invoked
+# softirq:      Time when homa_softirq processed the packet
+# softirq_core: Core on which SoftIRQ processed the packet
+# free_tx_skb:  Time when NAPI released the skb on the sender, which can't
+#               happen until the packet has been transmitted.
+# tx_qid:       NIC channel on which packet was transmitted
+# tx_node:      Node that sent grant, or empty string if unknown
+# rx_node:      Node that received grant, or empty string if unknown
+# id:           Id of the RPC on the node that sent the grant
+# offset:       Offset specified in the grant
+# increment:    How much previously ungranted data is covered by this grant;
+#               0 if the traces don't contain info about the previous grant
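+#
+# A minimal sketch of one entry (hypothetical values; the 'id:offset' key
+# format is the one described above):
+#   grants['1234569:100000'] = {
+#       'type': 'grant', 'id': 1234569, 'offset': 100000,
+#       'increment': 10000, 'xmit': 1050.456, 'gro': 1055.123,
+#       'tx_node': 'node2', 'rx_node': 'node1'}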
+class GrantDict(dict):
+    def __missing__(self, key):
+        id_str, offset_str = key.split(':')
+        self[key] = {'type': 'grant', 'id': int(id_str),
+                'offset': int(offset_str), 'increment': 0}
+        return self[key]
+grants = GrantDict()
+
+# This variable holds information about every TCP packet in the traces.
+# It is created by AnalyzePackets. See get_tcp_packet for details on the keys
+# used to look up packets. Each value is a dictionary containing the following
+# fields:
+# type:         Packet type: always 'tcp'
+# source:       Hex string identifying source port for the packet: lower 16
+#               bits are port number, upper 16 bits are low-order 16-bits of
+#               IPv4 address
+# dest:         Destination port for the packet; same format as source.
+# length:       # bytes of message data in the received packet
+# seq_ack:      If length is non-zero then this is the sequence number of
+#               the first byte of data in the packet; otherwise it is the
+#               acknowledgment sequence number.
+# tso_length:   The number of data bytes in the packet (before TSO). This
+#               field is not present in packets that are generated by TSO.
+# segments:     This field will be present in the first packet of each
+#               TSO packet (the one with tso_length set); it will be a
+#               list of all the other segments deriving from the same
+#               TSO packet.
+# total_length: Total length of the packet, including IP and TCP headers
+# xmit:         Time when ip*xmit was invoked for the packet
+# xmit2:        qdisc_xmit if it exists, otherwise xmit: a time when Homa
+#               has decided to transmit the packet (after any Homa queuing)
+# tx_core:      Core on which ip*xmit was invoked for the packet
+# qdisc_xmit:   Time when homa_qdisc requeued a packet that was deferred
+#               because of NIC queue length (only present for deferred
+#               packets)
+# nic:          Time when the packet was handed off to the NIC
+# free_tx_skb:  Time when NAPI released the skb on the sender, which can't
+#               happen until the packet has been fully transmitted.
+# gro:          Time when GRO received the packet
+# softirq:      Time when SoftIRQ received the packet
+# tx_node:      Node that sent the packet (corresponds to saddr), or empty
+#               string if unknown
+# tx_qid:       NIC channel on which packet was transmitted
+# rx_node:      Node that received the packet (corresponds to daddr), or empty
+#               string if unknown
+# retransmits:  Always empty (for compatibility with Homa packets)
+tcp_packets = {}
+
+# This variable holds information about every identifiable RPC sent via
+# TCP in the traces. It is created by the tcp_rpcs analyzer. Keys are
+# unique identifiers for RPCs in the form 'client server req_seq' with
+# the meanings described below.
Each value is a dictionary with the following +# entries: +# client: Client port: hex string whose low 16 bits are the port number, +# high 16 bits are the low-order bits of the client's IP address +# server: Server port (same format as client) +# slot: Slot allocated by cp_node on the client for the message; +# used to differentiate concurrent RPCs between the same +# client and server ports +# req_send: Time when tcp_sendmsg was invoked for the first bytes of +# the request +# req_seq: Sequence number of the first byte of the request message +# req_length: Size of the request message in bytes +# req_end_seq: req_seq + req_length +# req_pkts: List of request packets, in the order sent. Includes only +# data packets +# req_recvd: Time when tcp_recvmsg returned on the server +# resp_send: Time when tcp_sendmsg was invoked for the first bytes of +# the response +# resp_seq: Sequence number of the first byte of the response message +# resp_length: Size of the response message in bytes +# resp_end_seq: resp_seq + resp_length +# resp_pkts: List of response packets, in the order sent. Includes only +# data packets +# resp_recvd: Time when tcp_recvmsg returned on the client (RPC complete) +tcp_rpcs = {} + +# Node -> list of intervals for that node. Created by the intervals analyzer. +# Each interval contains information about a particular time range, including +# things that happened during that time range and the state of the node at +# the end of the period. The list entry for each interval is a dictionary with +# the following fields: +# time: Ending time of the interval (integer usecs); this time is +# included in the interval +# rpcs_live: Number of live RPCs for which this node is the client +# tx_live_req: Number of outgoing request messages with unsent data as +# of the end of the interval +# tx_live_resp: Number of outgoing response messages with unsent data as +# of the end of the interval +# tx_starts: Number of new outgoing messages that started in the interval +# tx_pkts: Number of data packets passed to ip*xmit during the interval +# (or requeued by homa_qdisc after deferral) +# tx_bytes: Number of bytes of data passed to ip*xmit during the interval +# (or requeued by homa_qdisc after deferral) +# tx_nic_pkts: Number of data packets passed to the NIC during the interval +# tx_nic_bytes: Number of bytes of data passed to the NIC during the interval +# tx_in_nic: Number of bytes of data that have been passed to the NIC +# but not yet returned via the tx completion queue, as of the +# end of the interval +# tx_in_nic2: Same as tx_in_nic except excludes bytes that have been +# received at target; provides a tighter (over)estimate of +# "bytes that are queued in the NIC but have not yet been +# transmitted" +# pkts_in_nic2: The number of packets associated with tx_in_nic2 +# tx_in_nic_qx: Same as tx_in_nic2 except only counts bytes in a particular +# tx queue (given by the --tx-qid option, default 0). 
+# tx_nic_rx:    Number of bytes of data that have been received by the
+#               destination but their packet buffers haven't been returned
+#               from the NIC via the completion queue, as of the end of
+#               the interval
+# tx_qdisc:     Bytes of data that have been passed to ip*xmit but not
+#               yet passed to the NIC, as of the end of the interval (large
+#               numbers probably due to qdisc)
+# tx_q:         Estimate of the number of unsent bytes in the NIC (based
+#               on when packets passed to the NIC if available, otherwise
+#               when passed to ip*xmit)
+# tx_gro_bytes: Bytes of data from this node received by GRO on other nodes
+#               during the interval
+# tx_free_bytes: Bytes of data freed after NIC notified tx completion
+# tx_max_free:  Largest value of pkt['free_tx_skb'] - pkt['nic'] for a
+#               packet passed to NIC in this interval (0 if no packets freed)
+# tx_min_free:  Smallest value of pkt['free_tx_skb'] - pkt['nic'] for a
+#               packet passed to NIC in this interval (0 if no packets freed)
+# tx_max_gro_free: Largest value of pkt['gro'] - pkt['free_tx_skb'] for any
+#               segment of a packet passed to NIC in this interval (None if
+#               no packets freed)
+# tx_min_gro_free: Smallest value of pkt['gro'] - pkt['free_tx_skb'] for any
+#               segment of a packet passed to NIC in this interval (None if
+#               no packets freed)
+# tx_grant_xmit: Bytes of grant that have been passed to ip*xmit but not yet
+#               received by GRO, as of the end of the interval
+# tx_grant_gro: Bytes of grant that have been received by GRO but not SoftIRQ,
+#               as of the end of the interval
+# tx_grant_avl: Bytes of grant that have been received by SoftIRQ but not
+#               yet transmitted, as of the end of the interval
+# tx_new_grants: Bytes of grants that became available at SoftIRQ level
+#               during the interval
+#
+# rx_starts:    Number of new incoming messages whose first packet was
+#               transmitted by the sender in the interval
+# rx_live:      Number of incoming messages that have been partially
+#               transmitted and/or received as of the end of the interval
+# rx_pkts:      Number of data packets received by GRO during the interval
+# rx_bytes:     Number of bytes of data received by GRO during the interval
+# rx_grantable: Number of incoming RPCs that have not been fully granted
+# rx_granted:   Bytes of grant that have been transmitted, but for which
+#               corresponding data has not been transmitted
+# rx_data_xmit: Number of bytes of data that have been passed to ip*xmit by
+#               the sender but not yet received by GRO, as of the end of
+#               the interval
+# rx_data_qdisc: Number of bytes of data that have been passed to ip*xmit
+#               by the sender but haven't actually been handed off to
+#               the NIC (they are queued in qdisc).
+# rx_overdue:   Number of bytes of data that were transmitted by the NIC
+#               at least 100 us ago but haven't yet been seen by GRO
+# rx_data_gro:  Number of bytes of data that have been received by GRO
+#               but not SoftIRQ, as of the end of the interval
+# rx_new_grants: Number of bytes of additional grants passed to ip*xmit
+#               during the interval
+#
+# The following fields are present only if the buffers analyzer is used.
+# They count bytes of incoming packet data (including headers) for this node
+# that are queued somewhere in the network between 'nic' and 'gro'.
+# q_homa_unsched: Bytes from unscheduled Homa packets
+# q_homa_sched: Bytes from scheduled Homa packets
+# q_homa_grant: Bytes from Homa grant packets
+# q_tcp:        Bytes from TCP packets
+#
+# The following fields are present only if the grants analyzer is used:
+# rx_grants:    Number of incoming RPCs with outstanding grants
+# rx_grant_bytes: Total bytes of data in outstanding grants for incoming RPCs
+# rx_grant_info: Formatted text describing incoming RPCs with outstanding
+#               grants as of the end of the interval
+# tx_grant_info: Formatted text describing outgoing RPCs with available
+#               grants as of the end of the interval
+intervals = None
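+
+# As a hedged usage sketch (node name and time are hypothetical): once the
+# intervals analyzer has populated this structure, other code can do
+# things like:
+#   ival = get_interval('node1', 500.0)
+#   if ival != None:
+#       print('%d data packets in interval' % (ival['tx_pkts']))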
+# Node (src) -> dictionary of
+#     Node (dst) -> minimum observed latency (from packet "nic" to "gro")
+#                   among all data and grant packets sent from src to dst.
+#                   A value of math.inf means there were no packets between
+#                   the hosts.
+# This structure is created only if the "minlatency" analyzer is active.
+min_latency = {}
+
+# Dispatcher used to parse the traces.
+dispatcher = None
+
+# Total bytes in an IPv4 header
+ipv4_hdr_length = 20
+
+# Total header bytes in a Homa data packet, including Homa header and
+# IP header.
+data_hdr_length = 60 + ipv4_hdr_length
+
+# Total bytes in a Homa grant packet, including IP header (assume IPv4).
+grant_pkt_length = 33 + ipv4_hdr_length
+
+# Total header bytes in a TCP packet (TCP and IP headers, assuming IPv4).
+tcp_hdr_length = ipv4_hdr_length + 20
+
+# Various color values for plotting:
+color_red = '#c00000'
+color_blue = '#1f77b4'
+color_brown = '#844f1a'
+color_green = '#00b050'
+
+def add_to_intervals(node, start, end, key, delta):
+    """
+    Find all of the intervals for node whose end times overlap the range
+    (start, end], then add delta to the key entry for each of those intervals.
+    """
+
+    global intervals, options
+    node_intervals = intervals[node]
+    t0 = node_intervals[0]['time'] - options.interval
+    if start < t0:
+        first = 0
+    else:
+        first = int((start - t0)/options.interval)
+    for interval in node_intervals[first:]:
+        if interval['time'] > end:
+            break
+        interval[key] += delta
+
+def bytes_to_usec(bytes):
+    """
+    Compute how many microseconds it takes to transmit a given number of
+    bytes, assuming a network speed equal to the --gbps option.
+    """
+    global options
+
+    return (bytes*8) / (options.gbps * 1000)
+
+def cmp_pkts(p1, p2, field):
+    """
+    Compute a sorting order among packets to print, using a given field
+    of the packets (a time value, which may not be present in either packet).
+    """
+
+    if field in p1:
+        if field in p2:
+            return p1[field] - p2[field]
+        else:
+            return 1
+    elif field in p2:
+        return -1
+    if p1['id'] != p2['id']:
+        return p1['id'] - p2['id']
+    return p1['offset'] - p2['offset']
+
+def dict_avg(data, key):
+    """
+    Given a list of dictionaries, return the average of the elements
+    with the given key.
+    """
+    count = 0
+    total = 0.0
+    for item in data:
+        if (key in item) and (item[key] != None):
+            total += item[key]
+            count += 1
+    if not count:
+        return 0
+    return total / count
+
+def div_safe(num, denom):
+    """
+    Return num/denom, or 0 if denom is zero.
+    """
+    if denom != 0:
+        return num/denom
+    else:
+        return 0
+
+def list_avg(data, index):
+    """
+    Given a list of lists, return the average of the index'th elements
+    of the lists.
+    """
+    if len(data) == 0:
+        return 0
+    total = 0
+    for item in data:
+        total += item[index]
+    return total / len(data)
+
+def extract_num(s):
+    """
+    If the argument contains an integer number as a substring,
+    return the number. Otherwise, return None.
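+
+    For example (doctest-style):
+        >>> extract_num('node12.tt')
+        12
+        >>> extract_num('server') is None
+        True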
+ """ + match = re.match('[^0-9]*([0-9]+)', s) + if match: + return int(match.group(1)) + return None + +def filter_rpcs(rpcs, msglen=None, rpc_start=None, rtt=None): + """ + Returns a list of all the Homa RPCs that match a set of command-line + options + rpcs: List of RPCs to filter (must be entries in rpcs); only + client-side RPCs are considered + msglen: If not None, filter on msglen (see --msglen arg) + rpc_start: If not None, filter on RPC start time (see --rpc-start arg) + rtt: If not None, filter on round-trip time (see --rtt arg) + """ + if msglen != None: + min_length, max_length = get_range(msglen, + option_name='--msglen', one_value=True) + if max_length == None: + max_length = min_length + min_length = 0 + if rpc_start != None: + min_start, max_start = get_range(rpc_start, + parse_float=True, option_name='--rpc-start') + if rtt != None: + min_rtt, max_rtt = get_range(rtt, parse_float = True, + option_name='--rtt') + + result = [] + for rpc in rpcs: + if rpc['id'] & 1: + continue + if msglen != None: + if not 'out_length' in rpc: + continue + length = rpc['out_length'] + if length < min_length or length > max_length: + continue + if rpc_start != None: + if not 'sendmsg' in rpc: + continue + start = rpc['sendmsg'] + if start < min_start or start > max_start: + continue + if rtt != None: + if not 'sendmsg' in rpc or not 'recvmsg_done' in rpc: + continue + rtt = rpc['recvmsg_done'] - rpc['sendmsg'] + if rtt < min_rtt or rtt > max_rtt: + continue + result.append(rpc) + return result + +def filter_tcp_rpcs(rpcs, msglen=None, rpc_start=None, rtt=None): + """ + Returns a list of all the TCP RPCs that match a set of command-line + options + rpcs: List of TCP RPCs to filter (must be entries in tcp_rpcs) + msglen: If not None, filter on msglen (see --msglen arg) + rpc_start: If not None, filter on RPC start time (see --rpc-start arg) + rtt: If not None, filter on round-trip time (see --rtt arg) + """ + if msglen != None: + min_length, max_length = get_range(msglen, + option_name='--msglen', one_value=True) + if max_length == None: + max_length = min_length + min_length = 0 + if rpc_start != None: + min_start, max_start = get_range(rpc_start, + parse_float=True, option_name='--rpc-start') + if rtt != None: + min_rtt, max_rtt = get_range(rtt, parse_float = True, + option_name='--rtt') + + result = [] + for rpc in rpcs: + if msglen != None: + if not 'req_length' in rpc: + continue + length = rpc['req_length'] + if length < min_length or length > max_length: + continue + if rpc_start != None: + if not 'req_send' in rpc: + continue + start = rpc['req_send'] + if start < min_start or start > max_start: + continue + if rtt != None: + if not 'req_send' in rpc or not 'resp_recvd' in rpc: + continue + rtt = rpc['resp_recvd'] - rpc['req_send'] + if rtt < min_rtt or rtt > max_rtt: + continue + result.append(rpc) + return result + +def gbps(bytes, usecs): + """ + Compute the data rate in Gbps for data transmitted or received in + an interval. + + bytes: Number of bytes transferred + usecs: Amount of time (microseconds) during which the transfer happened + """ + global options + + return ((bytes*8)/usecs)*1e-3 + +def get_first_interval_end(node=None): + """ + Used when writing out data at regular intervals during the traces. + Returns the end time of the first interval that contains any trace data. 
+ + node: Name of a node: if specified, returns the first interval that + contains data for this node; otherwise returns the first interval + that contains data for any node + """ + global traces, options + + if node == None: + start = get_first_time() + else: + start = traces[node]['first_time'] + interval_end = int(start)//options.interval * options.interval + if interval_end < start: + interval_end += options.interval + return interval_end + +def get_first_end(): + """ + Return the earliest time at which any of the traces ends (i.e. the last + time that is present in all of the trace files). + """ + earliest = 1e20 + for trace in traces.values(): + last = trace['last_time'] + if last < earliest: + earliest = last + return earliest + +def get_first_time(): + """ + Return the earliest event time across all trace files. + """ + earliest = 1e20 + for trace in traces.values(): + first = trace['first_time'] + if first < earliest: + earliest = first + return earliest + +def get_granted(rpc, time): + """ + Returns the offset of the last grant sent for an RPC as of a given time, + or None if no data available. + """ + max_offset = -1 + for pkt in rpc['send_grant_pkts']: + t = pkt['xmit'] + offset = pkt['offset'] + if (t < time) and (offset > max_offset): + max_offset = offset + if max_offset >= 0: + return max_offset + return None + +def get_hdr_length(pkt, tx=True): + """ + Returns the total amount of header data for a packet (i.e. everything + except message data). + pkt: A packet (either Homa data, Homa grant, or TCP) + tx: If true, compute the total headers for the transmitted packet, + which can include multiple segments for TSO frames. If false, + compute the header for the (single) received packet. + """ + global data_hdr_length, grant_pkt_length, tcp_hdr_length + + pkt_type = pkt['type'] + if pkt_type == 'data': + return data_hdr_length * (1 + tx * len(pkt['segments'])) + elif pkt_type == 'tcp': + return tcp_hdr_length * (1 + tx * len(pkt['segments'])) + elif pkt_type == 'grant': + return grant_pkt_length + else: + return 0 + +def get_interval(node, usecs): + """ + Returns the interval dictionary corresponding to the arguments. A + new interval is created if the desired interval doesn't exist. Returns None + if the interval ends before the first trace record for the node or starts + after the last record for the node, or if there is no interval + information associated with node. The intervals structure must have been + initialized before this function is called. + + node: Name of the desired node + usecs: Time whose interval is desired. Intervals include their + ending time but not their starting time + """ + global intervals, options, traces + + if not node in intervals: + return None + data = intervals[node] + interval_length = options.interval + first_end = data[0]['time'] + # Adjust time down slightly to ensure that even intervals (e.g. 500.0) + # end up in the right place. + i = int((usecs - 1e-5 - (first_end - interval_length)) / interval_length) + if i < 0 or i >= len(data): + return None + return data[i] + +def get_last_start(): + """ + Return the latest time at which any of the traces begins (i.e. the first + time that is present in all of the trace files). + """ + latest = -1e20 + for trace in traces.values(): + first = trace['first_time'] + if first > latest: + latest = first + return latest + +def get_last_time(): + """ + Return the latest event time across all trace files. 
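+
+    For example, if traces were {'n1': {'last_time': 500.0},
+    'n2': {'last_time': 750.0}} this would return 750.0 (a sketch; real
+    entries contain additional fields).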
+ """ + latest = -1e20 + for trace in traces.values(): + last = trace['last_time'] + if last > latest: + latest = last + return latest + +def get_max_gro(pkt): + """ + If pkt is a TSO packet that was divided into multiple segments, returns + the largest 'gro' from any segment. Otherwise returns pkt['gro'] if it + exists (result could be None). + """ + max_gro = 1e-20 + if 'gro' in pkt: + max_gro = pkt['gro'] + if 'segments' in pkt: + for seg in pkt['segments']: + if 'gro' in seg: + max_gro = max(max_gro, seg['gro']) + if max_gro == 1e-20: + return None + return max_gro + +def get_mtu(): + """ + Returns the amount of message data in a full-size network packet (as + received by the receiver; GSO packets sent by senders may be larger). + """ + + # Use get_recv_length to do all of the work. + get_recv_length(0) + return get_recv_length.mtu + +def get_packet(id, offset): + """ + Returns the entry in packets corresponding to id and offset. + """ + global packets + return packets['%d:%d' % (id, offset)] + +def get_range(s, option_name=None, parse_float=False, one_value=True): + """ + Parse a range defined by two endpoints and return the endpoints as a list. + s: The input string to parse; may contain either one or + two values + option_name: If specified, contains the name of the option that was + specified as range; used for error messages + parse_float: True means parse values as floating-point numbers; False + means integers + one_value: True means it is OK for s to contain only one value, in + which case it is returned as the lower end of the range, + with None as the upper end. + """ + + values = s.split() + if len(values) == 1: + if not one_value: + if option_name != None: + raise Exception('Bad %s spec \'%s\'; must contain two values' % + (option_name, s)) + else: + raise Exception('Bad range spec \'%s\'; must contain two values' + % (s)) + min = float(values[0]) if parse_float else int(values[0]) + return [min, None] + if len(values) == 2: + min = float(values[0]) if parse_float else int(values[0]) + max = float(values[1]) if parse_float else int(values[1]) + return [min, max] + if option_name != None: + raise Exception('Bad %s spec \'%s\'; must be \'value\' or ' + '\'value1 value2\'' % (option_name, s)) + else: + raise Exception('Bad range spec \'%s\'; must be \'value\' or ' + '\'value1 value2\'' % (s)) + +def get_recv_length(offset, msg_length=None): + """ + Compute the length of a received packet. Uses information collected in the + recv_offsets global variable, and assumes that all messages use the same + set of offsets. + + offset: Offset of the first byte in the packet. + msg_length: Total number of bytes in the message, if known. If not + supplied, then the last packet in a message may have its + length overestimated. 
+ """ + global recv_offsets + if len(get_recv_length.lengths) != len(recv_offsets): + # Must recompute lengths (new offsets have appeared) + get_recv_length.lengths = {} + sorted_offsets = sorted(recv_offsets.keys()) + max = 0 + for i in range(len(sorted_offsets)-1): + length = sorted_offsets[i+1] - sorted_offsets[i] + if length > max: + max = length + get_recv_length.lengths[sorted_offsets[i]] = length + get_recv_length.lengths[sorted_offsets[-1]] = max + get_recv_length.mtu = max + if offset in get_recv_length.lengths: + length = get_recv_length.lengths[offset] + else: + length = get_recv_length.mtu + if msg_length != None: + if ((offset + length) > msg_length) or (length == 0): + length = msg_length - offset + return length + +def get_received(rpc, time): + """ + Returns the offset of the byte just after the last one received by + SoftIRQ for an RPC as of a given time, or None if no data available. + Assumes that the rpc analyzer has run. + """ + max_recv = -1 + for pkt in rpc['softirq_data_pkts']: + t = pkt['softirq'] + recv = pkt['offset'] + pkt['length'] + if (t < time) and (recv > max_recv): + max_recv = recv + if max_recv >= 0: + return max_recv + + # No packets have been received by SoftIRQ + if ('recvmsg_done' in rpc) and (rpc['recvmsg_done'] < time): + return rpc['in_length'] + # If there are GRO packets, assume last SoftIRQ packet is the one + # just before the first GRO packet. + min_offset = 1e20 + for pkt in rpc['gro_data_pkts']: + offset = pkt['offset'] + if offset < min_offset: + min_offset = offset + if (min_offset < 1e20) and (min_offset > 0): + return min_offset + return None + +# offset -> max packet length for that offset. +get_recv_length.lengths = {} +# Maximum length for any offset. +get_recv_length.mtu = 0 + +def get_rpc_node(id): + """ + Given an RPC id, return the name of the node corresponding + to that id, or an empty string if a node could not be determined. + """ + global rpcs, traces + if id in rpcs: + return rpcs[id]['node'] + if id^1 in rpcs: + rpc = rpcs[id^1] + if 'peer' in rpc: + return ip_to_node[rpc['peer']] + return '' + +def get_sorted_nodes(): + """ + Returns a list of node names ('node' value from traces), sorted + by node number if there are numbers in the names, otherwise + sorted alphabetically. + """ + global traces + + # We cache the result to avoid recomputing + if get_sorted_nodes.result != None: + return get_sorted_nodes.result + + # First see if all of the names contain numbers. + nodes = traces.keys() + got_nums = True + for node in nodes: + if extract_num(node) == None: + got_nums = False + break + if not got_nums: + get_sorted_nodes.result = sorted(nodes) + else: + get_sorted_nodes.result = sorted(nodes, key=lambda name : extract_num(name)) + return get_sorted_nodes.result +get_sorted_nodes.result = None + +def get_tcp_node(addr_port): + """ + Return the name of the node corresponding to the argument, or None + if no corresponding node could be found. + addr_port: A hex string used in TCP timetrace entries: the lower + 16 bits are a port number and the upper 16 bits are + the low 16 bits of a node's IP address. + """ + global ip_to_node + + key = addr_port[:-4] + if key in ip_to_node: + return ip_to_node[key] + return None + +def get_tcp_packet(source, dest, data_bytes, seq_ack): + """ + Returns the entry in tcp_packets corresponding to the arguments. Creates + a new packet if it doesn't already exist. 
+ + source: Hex string identifying source for packet; lower 16 bits are + port number, upper 16 bits are low-order 16-bits of IP address + dest: Hex string identifying destination for packet; same format + as source + data_bytes: Amount of payload data in the packet + seq_ack: Packet sequence number if data_bytes != 0, otherwise + ack sequence number from packet + """ + global tcp_packets + + # Distinguish data packets (those where data_bytes is nonzero) from + # packets that are purely acknowledgments (data_bytes is zero). + if data_bytes != 0: + key = f'{source} {dest} {seq_ack} data' + else: + key = f'{source} {dest} {seq_ack} ack' + if key in tcp_packets: + return tcp_packets[key] + pkt = {'type': 'tcp', 'source': source, 'dest': dest, 'seq_ack': seq_ack, + 'retransmits': [], 'segments': [], 'tx_node': '', 'rx_node': ''} + tcp_packets[key] = pkt + return pkt + +def get_time_stats(samples): + """ + Given a list of elapsed times, returns a string containing statistics + such as min time, P99, and average. + """ + if not samples: + return 'no data' + sorted_data = sorted(samples) + average = sum(sorted_data)/len(samples) + return 'Min %.1f, P50 %.1f, P90 %.1f, P99 %.1f, Avg %.1f' % ( + sorted_data[0], + sorted_data[50*len(sorted_data)//100], + sorted_data[90*len(sorted_data)//100], + sorted_data[99*len(sorted_data)//100], + average) + +def get_xmit_time(offset, rpc, rx_time=1e20): + """ + Returns the time when a given offset was transmitted by an RPC. If + there is not a precise record of this, estimate the time based on other + packets sent for the RPC. If we couldn't even make a reasonable estimate, + then None is returned. + + offset: Offset within the outgoing message for rpc + rpc: An entry in the global variable "rpcs" + rx_time: Time when the packet was received; omit if unknown + """ + + xmit = rx_time + fallback = None + for pkt_time, pkt_offset, length in rpc['send_data']: + if offset < pkt_offset: + if (fallback == None) and (pkt_time < rx_time): + # No record so far for the desired packet; use the time from + # the next packet as a fall-back. + fallback = pkt_time + elif offset < (pkt_offset + length): + if (pkt_time < rx_time): + xmit = pkt_time + if pkt_time >= rx_time: + break + if xmit == 1e20: + return fallback + return xmit + +def pkt_id(id, offset): + return '%d:%d' % (id, offset) + +def plot_ccdf(data, file, fig_size=(8,6), title=None, size=10, + y_label="Cumulative Fraction", x_label="Delay (usecs)"): + """ + Generate a complementary CDF with log-scale y-axis. + + data: X-values for the graph, in ascending order + file: File in which to write the plot. + """ + + plt.figure(figsize=fig_size) + if title != None: + plt.title(title, size=size) + if x_label: + plt.xlabel(x_label, size=size) + plt.yscale("log") + plt.ylim(1e-3, 1.0) + if y_label: + plt.ylabel(y_label, size=size) + plt.tick_params(top=True, which="both", direction="in", labelsize=size, + length=5) + plt.grid(which="major", axis="x") + plt.grid(which="major", axis="y") + l = len(data) + y = [(l - i)/l for i in range(0, l)] + plt.plot(data, y, color=color_blue) + left, right = plt.xlim() + print('left: %s, right: %s' % (left, right)) + plt.xlim(left=0, right=right) + plt.tight_layout() + plt.savefig(file) + +def print_analyzer_help(): + """ + Prints out documentation for all of the analyzers. 
+ """ + + global options + analyzers = options.analyzers.split() + module = sys.modules[__name__] + for attr in sorted(dir(module)): + if not attr.startswith('Analyze'): + continue + object = getattr(module, attr) + analyzer = attr[7].lower() + attr[8:] + if (options.analyzers != 'all') and (not analyzer in analyzers): + continue + if hasattr(object, 'output'): + print('%s: %s' % (analyzer, object.__doc__)) + +def print_field_if(dict, field, fmt, modifier=None): + """ + Format a given field in a dictionary, if it is present. If the field + isn't present, return an empty string. + dict: Dictionary containing the desired field. + field: Name of field within dictionary. + fmt: Format string (e.g. %7.1f) to apply to the field, if it is + present. + modifier: If specified, this is a lambda that is applied to the field + to modify its value before formatting. + """ + if field in dict: + value = dict[field] + if modifier != None: + value = modifier(value) + return fmt % (value) + return '' + +def print_if(value, fmt, modifier=None): + """ + Format a value if it isn't None, otherwise return an empty string. + value: Value to format. + fmt: Format string (e.g. %7.1f) to apply to the value, if it is + not None. + modifier: If specified, this is a lambda that is applied to the field + to modify its value before formatting. + """ + if value != None: + if modifier != None: + value = modifier(value) + return fmt % (value) + return '' + +def print_pctl(values, pctl, fmt): + """ + Return a formatted string describing a given percentile from a list + of values. + values: List of values, sorted from 0th percentile to 100th percentile. + If empty then an empty string is returned. + pctl: Desired percentile, from 0-1000 (e.g. 900 selects P90) + fmt: printf-style formt string containing a single % specifier for + the selected percentile. + """ + if len(values) == 0: + return '' + ix = len(values) * pctl // 1000 + return fmt % (values[ix] if ix < len(values) else values[-1]) + +def print_pkts(pkts, header=True, comment=False): + """ + Returns a string containing one line for each packet in pkts, which + contains various useful information about the packet. + pkts: Packets to print (either Homa packets or TCP packets) + header: If True, the result string will include initial text describing + the fields that are printed on each line. + comment: If True, all of the initial lines except column headers will + be preceded by '# '. 
+ """ + + buf = StringIO() + prefix = '# ' if comment else '' + if header: + buf.write(prefix + 'Source: Node that sent packet\n') + buf.write(prefix + 'Dest: Node to which packet was sent\n') + buf.write(prefix + 'Xmit: Time when packet was passed to ip*xmit\n') + buf.write(prefix + 'Qdisc: Time when homa_qdisc requeued packet ' + 'after deferral, if any\n') + buf.write(prefix + 'Id/Seq: RPC identifier for Homa packets, sequence ' + 'number for TCP\n') + buf.write(prefix + 'Offset: Offset of packet within message or ' + '"TCP" if packet is TCP\n') + buf.write(prefix + 'Length: Size of packet; for the first segment ' + 'generated from a TSO\n') + buf.write(prefix + ' frame this is the size of the TSO ' + 'frame; for other segments\n') + buf.write(prefix + ' it is the size of the received packet\n') + buf.write(prefix + 'Qid: Transmit queue on which packet was sent\n') + buf.write(prefix + 'Nic: Time when packet was queued for NIC\n') + buf.write(prefix + 'NDelay: Nic - Xmit\n') + buf.write(prefix + 'Gro: Time when packet was received by GRO\n') + buf.write(prefix + 'GDelay: Gro - Nic\n') + buf.write(prefix + 'Free: Time when sk_buff was released on ' + 'sender\n') + buf.write(prefix + 'FDelay: Free - Nic\n') + buf.write(prefix + 'Rx: Number of times segments in the packet ' + 'were retransmitted\n\n') + buf.write('Source Dest Xmit Qdisc Id/Seq Offset') + buf.write(' Length Qid Nic NDelay Gro GDelay') + buf.write(' Free FDelay Rx\n') + for pkt in pkts: + xmit = pkt['xmit'] if 'xmit' in pkt else None + if 'qdisc_xmit' in pkt: + qdisc = pkt['qdisc_xmit'] + qdisc_string = '%10.3f' % (qdisc) + else: + qdisc = None + qdisc_string = '' + nic_delay = None + if 'nic' in pkt: + nic = pkt['nic'] + if xmit != None: + nic_delay = nic - xmit + else: + nic = None + gro = pkt['gro'] if 'gro' in pkt else None + free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None + qid = pkt['tx_qid'] if 'tx_qid' in pkt else None + length = pkt['tso_length'] if 'tso_length' in pkt else pkt['length'] + + rx = len(pkt['retransmits']) + if 'segments' in pkt: + for seg in pkt['segments']: + rx += len(seg['retransmits']) + rx_msg = str(rx) if rx > 0 else "" + + line = '%-8s %-8s %10s %10s' % (pkt['tx_node'], pkt['rx_node'], + print_if(xmit, '%.3f'), qdisc_string) + if pkt['type'] == 'data': + line += ' %10d %6d' % (pkt['id'], pkt['offset']) + else: + # This is a TCP packet + line += ' %10d TCP' % (pkt['seq_ack']) + nic_delay_string = '' + if nic_delay != None: + nic_delay_string = '%.1f' % (nic_delay) + gro_delay_string = '' + if gro != None and nic != None: + gro_delay_string = '%.1f' % (gro - nic) + line += ' %6d %3s %10s %7s %10s %7s' % (length, + print_if(qid, '%d'), print_if(nic, '%.3f'), nic_delay_string, + print_if(gro, '%.3f'), gro_delay_string) + free_delay_string = '' + if (nic != None) and (free != None): + free_delay_string = '%.1f' % (free - nic) + line += ' %10s %7s %2s' % (print_if(free, '%.3f'), + free_delay_string, rx_msg) + buf.write(line.rstrip()) + buf.write('\n') + return buf.getvalue() + +def print_rpcs(client_rpcs, header=True): + """ + Returns a string containing one line for each RPC in client_rpcs, which + contains various useful statistics about the RPC. The RPCs are all + assumed to be client-side RPCs. If header is True then the string also + includes initial text describing the fields that are printed on each line. 
+ """ + global rpcs + + buf = StringIO() + if header: + buf.write('Start: Time when homa_sendmsg was invoked for request\n') + buf.write('Client: Node that sent the RPC request\n') + buf.write('Server: Node that handled the RPC and sent response\n') + buf.write('Id: RPC identifier (client side)\n') + buf.write('Length: Length of request message\n') + buf.write('RqNic: Elapsed time from sendmsg until first ' + 'request packet handed\n') + buf.write(' off to NIC\n') + buf.write('RqGRO: Time from NIC handoff to GRO receipt for ' + 'first request packet\n') + buf.write('RqSoft: Time from GRO to SoftIRQ for first request ' + 'packet\n') + buf.write('RqRecv: Time from SoftIRQ for first request packet ' + 'until recvmsg completes\n') + buf.write(' on server\n') + buf.write('Srvc: Time from recvmsg return on server until ' + 'sendmsg for response\n') + buf.write('RspNic: Elapsed time from sendmsg of response until ' + 'first packet handed\n') + buf.write(' off to NIC\n') + buf.write('RspGRO: Time from NIC handoff to GRO receipt for ' + 'first response packet\n') + buf.write('RspSoft: Time from GRO to SoftIRQ for first response ' + 'packet\n') + buf.write('RspRecv: Time from SoftIRQ for first response packet ' + 'until RPC completes\n') + buf.write('End: Time when response was returned to client\n') + buf.write('Rtt: End-to-end RTT\n\n') + buf.write('Start Client Server Id Length RqNic RqGRO ') + buf.write('RqSoft RqRecv Srvc RspNic RspGRO ') + buf.write('RspSoft RspRecv End Rtt\n') + for rpc in client_rpcs: + peer_id = rpc['id'] ^ 1 + if peer_id in rpcs: + srpc = rpcs[peer_id] + else: + srpc = {} + tx = rpc['send_data_pkts'][0] if rpc['send_data_pkts'] else {} + rx = rpc['softirq_data_pkts'][0] if rpc['softirq_data_pkts'] else {} + if 'sendmsg' in rpc: + start = '%.3f' % (rpc['sendmsg']) + else: + start = '' + if 'nic' in tx and 'sendmsg' in rpc: + rq_nic = '%.1f' % (tx['nic'] - rpc['sendmsg']) + else: + rq_nic = '' + if 'gro' in tx and 'nic' in tx: + rq_gro = '%.1f' % (tx['gro'] - tx['nic']) + else: + rq_gro = '' + if 'softirq' in tx and 'gro' in tx: + rq_soft = '%.1f' % (tx['softirq'] - tx['gro']) + else: + rq_soft = '' + if 'recvmsg_done' in srpc and 'softirq' in tx: + rq_recv = '%.1f' % (srpc['recvmsg_done'] - tx['softirq']) + else: + rq_recv = '' + if 'sendmsg' in srpc and 'recvmsg_done' in srpc: + srvc = '%.1f' % (srpc['sendmsg'] - srpc['recvmsg_done']) + else: + srvc = '' + if 'nic' in rx and 'sendmsg' in srpc: + rsp_nic = '%.1f' % (rx['nic'] - srpc['sendmsg']) + else: + rsp_nic = '' + if 'gro' in rx and 'nic' in rx: + rsp_gro = '%.1f' % (rx['gro'] - rx['nic']) + else: + rsp_gro = '' + if 'softirq' in rx and 'gro' in rx: + rsp_soft = '%.1f' % (rx['softirq'] - rx['gro']) + else: + rsp_soft = '' + if 'recvmsg_done' in rpc and 'softirq' in rx: + rsp_recv = '%.1f' % (rpc['recvmsg_done'] - rx['softirq']) + else: + rsp_recv = '' + if 'recvmsg_done' in rpc and 'sendmsg' in rpc: + rtt = '%.1f' % (rpc['recvmsg_done'] - rpc['sendmsg']) + else: + rtt = '' + if 'recvmsg_done' in rpc: + end = '%.3f' % (rpc['recvmsg_done']) + else: + end = '' + buf.write('%10s %-8s %-8s %10s %7d %6s %6s' % (start, + rpc['node'], get_rpc_node(peer_id), rpc['id'], rpc['out_length'], + rq_nic, rq_gro)) + buf.write(' %6s %6s %6s %6s %6s' % ( + rq_soft, rq_recv, srvc, rsp_nic, rsp_gro)) + buf.write(' %7s %7s %10s %6s\n' % (rsp_soft, rsp_recv, end, rtt)) + return buf.getvalue() + +def print_tcp_rpcs(rpcs, header=True): + """ + Returns a string containing one line for each RPC in tcp_rpcs, which + contains various useful 
statistics about the RPC. + rpcs: RPCs to print; must be entries in tcp_rpcs + header: If True then the result will include initial text describing + the fields printed on each line. + """ + buf = StringIO() + if header: + buf.write('Start: Time when tcp_sendmsg was invoked for request\n') + buf.write('Client: Node that sent the RPC request\n') + buf.write('Server: Node that handled the RPC and sent response\n') + buf.write('Length: Length of request message\n') + buf.write('ReqSeq: Sequence number of first byte of request\n') + buf.write('RspSeq: Sequence number of first byte of response\n') + buf.write('ReqXmit: Elapsed time from sendmsg until first ' + 'request packet handed\n') + buf.write(' off to NIC\n') + buf.write('ReqNet: Time from NIC handoff to GRO receipt for ' + 'first request packet\n') + buf.write('ReqSft: Time from GRO for last request packet until ' + 'SoftIRQ for it\n') + buf.write('ReqRecv: Time from SoftIRQ for last request packet ' + 'until recvmsg completes\n') + buf.write(' on server\n') + buf.write('Srvc: Time from recvmsg return on server until ' + 'sendmsg for response\n') + buf.write('RspXmit: Elapsed time from sendmsg of response until ' + 'first packet handed\n') + buf.write(' off to NIC\n') + buf.write('RspNet: Time from NIC handoff to GRO receipt for ' + 'first response packet\n') + buf.write('RspSft: Time from GRO for last response packet until ' + 'SoftIRQ for it\n') + buf.write('RspRecv: Time from SoftIRQ for last response packet ' + 'until End\n') + buf.write('End: Time when response was returned to client\n') + buf.write('Rtt: RspRecv - Start\n\n') + buf.write('Start Client Server Length ReqSeq RspSeq ') + buf.write('ReqXmit ReqNet ReqSft ReqRecv Srvc ') + buf.write('RspXmit RspNet RspSft RspRecv End Rtt\n') + for rpc in rpcs: + if rpc['req_pkts']: + first_req_pkt = rpc['req_pkts'][0] + last_req_pkt = rpc['req_pkts'][-1] + else: + first_req_pkt = [] + last_req_pkt = [] + if rpc['resp_pkts']: + first_resp_pkt = rpc['resp_pkts'][0] + last_resp_pkt = rpc['resp_pkts'][-1] + else: + first_resp_pkt = [] + last_resp_pkt = [] + if 'resp_seq' in rpc: + resp_seq = '%d' % (rpc['resp_seq']) + else: + resp_seq = '' + if 'nic' in first_req_pkt: + rq_xmit = '%.1f' % (first_req_pkt['nic'] - rpc['req_send']) + else: + rq_xmit = '' + if 'gro' in first_req_pkt and 'nic' in first_req_pkt: + rq_net = '%.1f' % (first_req_pkt['gro'] - first_req_pkt['nic']) + else: + rq_net = '' + if 'gro' in last_req_pkt and 'softirq' in last_req_pkt: + rq_soft = '%.1f' % (last_req_pkt['softirq'] - last_req_pkt['gro']) + else: + rq_soft = '' + if 'softirq' in last_req_pkt and 'req_recvd' in rpc: + rq_recv = '%.1f' % (rpc['req_recvd'] - last_req_pkt['gro']) + else: + rq_recv = '' + if 'req_recvd' in rpc and 'resp_send' in rpc: + srvc = '%.1f' % (rpc['resp_send'] - rpc['req_recvd']) + else: + srvc = '' + if 'nic' in first_resp_pkt: + rsp_xmit = '%.1f' % (first_resp_pkt['nic'] - rpc['resp_send']) + else: + rsp_xmit = '' + if 'gro' in first_resp_pkt and 'nic' in first_resp_pkt: + rsp_net = '%.1f' % (first_resp_pkt['gro'] - first_resp_pkt['nic']) + else: + rsp_net = '' + if 'gro' in last_resp_pkt and 'softirq' in last_resp_pkt: + rsp_soft = '%.1f' % (last_resp_pkt['softirq'] - last_resp_pkt['gro']) + else: + rsp_soft = '' + if 'softirq' in last_resp_pkt and 'resp_recvd' in rpc: + rsp_recv = '%.1f' % (rpc['resp_recvd'] - last_resp_pkt['softirq']) + else: + rsp_recv = '' + if 'req_send' in rpc and 'resp_recvd' in rpc: + rtt = '%.1f' % (rpc['resp_recvd'] - rpc['req_send']) + else: + rtt = '' + if 
'resp_recvd' in rpc: + end = '%.3f' % (rpc['resp_recvd']) + else: + end = '' + line = ('%10.3f %-8s %-8s %7d %10d %10s' % ( + rpc['req_send'], get_tcp_node(rpc['client']), + get_tcp_node(rpc['server']), rpc['req_length'], + rpc['req_seq'], resp_seq)) + line += (' %7s %6s %6s %7s %6s' % ( + rq_xmit, rq_net, rq_soft, rq_recv, srvc)) + line += (' %7s %6s %6s %7s %10s %7s' % ( + rsp_xmit, rsp_net, rsp_soft, rsp_recv, end, rtt)) + buf.write(line.rstrip()) + buf.write('\n') + return buf.getvalue() + +def require_options(analyzer, *args): + """ + For each argument, ensures that the associated option has been specified; + raises an exception if it hasn't. The analyzer argument gives the name + of the analyzer requiring the options, for use in the exception message. + """ + global options + for arg in args: + if getattr(options, arg) == None: + raise Exception('The %s analyzer requires the --%s option' % ( + analyzer, arg)) + +def set_tcp_ip_node(tcp_endpoint, node): + """ + Add a mapping from IP address to node to the ip_to_node table. + tcp_endpoint: An endpoint spec from a TCP packet. Must be a hex string + (with leading "0x") whose high-order 16 bits are the + low-order 16 bits of an IPv4 address. + node: Name of the node corresponding to tcp_endpoing + """ + key = tcp_endpoint[:-4] + ip_to_node[key] = node + +def sort_pkts(pkts, key): + """ + Sort a list of packets using a given key and return the sorted list. + pkts: Packets to sort + key: Determines sort order (typically the value of the --sort + option); must be 'Xmit', 'Nic', 'Gro', 'SoftIRQ', or 'Free' + """ + + sort_keys = {'Xmit': 'xmit', 'Qdisc': 'qdisc_xmit', 'Nic': 'nic', + 'Gro': 'gro', 'SoftIRQ': 'softirq', 'Free': 'free_tx_skb'} + if not key in sort_keys: + raise Exception('Invalid sort option %s: must be one of %s' % ( + key, sort_keys.keys())) + sort_key = sort_keys[key] + return sorted(pkts, key = lambda pkt : + pkt[sort_key] if sort_key in pkt else 1e20) + +def sum_fields(list, field): + """ + Given a list of dictionaries, return the sum of a given field in each + of the dictionaries. + """ + total = 0 + for d in list: + total += d[field] + return total + +class Dispatcher: + + # Info about trace file currently being parsed, or None if none. + cur_trace = None + + """ + This class manages a set of patterns to match against the records + of a timetrace. It then reads time trace files and passes information + about matching records to other classes that are interested in them. + """ + + def __init__(self): + # List of all objects with registered interests, in order of + # registration. + self.objs = [] + + # Keys are names of all classes passed to the interest method. + # Values are the corresponding objects. + self.analyzers = {} + + # Pattern name -> list of analyzer classes interested in that pattern. + self.interests = {} + + # List of objects with tt_all methods, which will be invoked for + # every record. + self.all_interests= [] + + # Pattern prefix -> list of patterns with that prefix. All of the + # keys have the same length, given by self.prefix_length. Entries + # in each list have the same order that they appear in patterns. + # Setting this to None causes it to be recomputed the next time + # a trace file is read + self.parse_table = None + + # Pattern name -> pattern + self.pattern_dict = {} + + # The number of initial characters of the message portion of a + # trace record that are used to lookup in parse_table. 
This is + # the largest number such that each pattern has at least this many + # literal initial characters. + self.prefix_length = -1 + + # Total nanoseconds spent parsing trace files so far. + self.parse_ns = 0 + + # Total number of lines parsed from trace files so far. + self.trace_lines = 0 + + # Total number of times regexps were applied to lines of trace + # files (whether they matched or not) + self.regex_tries = 0 + + # Core -> dictionary of saved values for that core. Used in situations + # where it takes multiple time trace entries to provide all the data + # needed for an event: info accumulates here until the last time + # trace entry is seen. + self.core_saved = defaultdict(dict) + + for pattern in self.patterns: + pattern['matches'] = 0 + self.pattern_dict[pattern['name']] = pattern + + def get_analyzer(self, name): + """ + Return the analyzer object associated with name, or None if + there is no such analyzer. + + name: Name of an analyzer class. + """ + + if name in self.analyzers: + return self.analyzers[name] + else: + return None + + def get_analyzers(self): + """ + Return a list of all analyzer objects registered with this + dispatcher + """ + + return self.objs + + def pattern_matched(self, name): + """ + Return True if the pattern with the given name matched at least + one event in the traces, False if it never matched + """ + return self.pattern_dict[name]['matches'] > 0 + + def interest(self, analyzer): + """ + If analyzer hasn't already been registered with this dispatcher, + create an instance of that class and arrange for its methods to + be invoked for matching lines in timetrace files. For each method + named 'tt_xxx' in the class there must be a pattern named 'xxx'; + the method will be invoked whenever the pattern matches a timetrace + line, with parameters containing parsed fields from the line. In any + case, returns analyzer object. + + analyzer: name of a class containing trace analysis code + """ + + if analyzer in self.analyzers: + return self.analyzers[analyzer] + + # This line breaks circular dependency chains. + self.analyzers[analyzer] = None + obj = getattr(sys.modules[__name__], analyzer)(self) + self.analyzers[analyzer] = obj + self.objs.append(obj) + + for name in dir(obj): + if not name.startswith('tt_'): + continue + method = getattr(obj, name) + if not callable(method): + continue + name = name[3:] + if name == 'all': + self.all_interests.append(obj) + continue + name_len = len(name) + for pattern in self.patterns: + # Include all patterns whose names either match the given + # name or consist of the name followed by a number (used for + # situations where it takes multiple timetrace entries to + # supply relevant data). + if not pattern['name'].startswith(name): + continue + if (len(pattern['name']) != name_len and + not pattern['name'][name_len:].isdigit()): + continue + found_pattern = True + if not pattern['name'] in self.interests: + self.interests[pattern['name']] = [] + self.interests[pattern['name']].append(obj) + if not name in self.interests: + raise Exception('Couldn\'t find pattern %s for analyzer %s' + % (name, analyzer)) + return obj + + def parse(self, file): + """ + Parse a timetrace file and invoke interests. + file: Name of the file to parse. 
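+
+        A minimal usage sketch (file name hypothetical):
+            dispatcher.parse('traces/node1.tt')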
+ """ + + global traces + start_ns = time.time_ns() + self.__build_parse_table() + prefix_matcher = re.compile(' *([-0-9.]+) us .* \[C([0-9]+)\] (.*)') + + trace = {} + trace['file'] = file + node = Path(file).stem + trace['node'] = node + traces[node] = trace + Dispatcher.cur_trace = trace + + print('Reading trace file %s' % (file), file=sys.stderr) + for analyzer in self.objs: + if hasattr(analyzer, 'init_trace'): + analyzer.init_trace(trace) + + f = open(file) + first = True + for trace['line'] in f: + # Parse each line in 2 phases: first the time and core information + # that is common to all patterns, then the message, which will + # select at most one pattern. + # print('\n%s' % (trace['line'].rstrip())) + self.trace_lines += 1 + self.regex_tries += 1 + match = prefix_matcher.match(trace['line']) + if not match: + continue + t = float(match.group(1)) + core = int(match.group(2)) + msg = match.group(3) + + if first: + trace['first_time'] = t + first = False + trace['last_time'] = t + prefix = msg[0:self.prefix_length] + if prefix in self.parse_table: + for pattern in self.parse_table[prefix]: + self.regex_tries += 1 + # print(' %s' % (pattern['regexp'])) + match = pattern['cregexp'].match(msg) + if match: + pattern['matches'] += 1 + pattern['parser'](trace, t, core, match, + self.interests[pattern['name']]) + break + for interest in self.all_interests: + interest.tt_all(trace, t, core, msg) + f.close() + trace['elapsed_time'] = trace['last_time'] - trace['first_time'] + self.parse_ns += time.time_ns() - start_ns; + Dispatcher.cur_trace = None + + def print_no_matches(self): + """ + Print out information about patterns that didn't match any lines + in any trace file. + """ + no_matches = [] + for patterns in self.parse_table.values(): + for pattern in patterns: + if pattern['matches'] > 0: + continue + no_matches.append(pattern) + if no_matches: + print('No lines matched the following patterns:', file=sys.stderr) + for pattern in no_matches: + print(' %s' % (pattern['regexp']), file=sys.stderr) + + def print_stats(self): + """ + Print statistics about the efficiency of parsing trace files. + """ + print('Trace file lines read: %d' % (self.trace_lines)) + print('Regex matches attempted: %d (%.1f per line)' % ( + self.regex_tries, self.regex_tries/self.trace_lines)) + print('Trace file parse time: %.3f sec' % (self.parse_ns*1e-9)) + print('(%.1f usec/line, %.1f usec/regex attempt)' % ( + ((self.parse_ns/self.trace_lines)*1e-3), + ((self.parse_ns/self.regex_tries)*1e-3))) + sum = 0 + for bucket in self.parse_table.values(): + sum += len(bucket) + print('Parse table has %d patterns in %d buckets' % ( + sum, len(self.parse_table))) + + def __build_parse_table(self): + """ + Builds self.parse_table. Also sets the 'parser' and 'cregexp' elements + for each pattern. + """ + if self.parse_table != None: + return + + # Pattern prefix -> list of patterns with that prefix + self.parse_table = defaultdict(list) + + # Pass 1: first compute self.prefix_length and set the 'parser' + # and 'cregexp' elements of pattern entries. 
+ self.prefix_length = 1000 + for pattern in self.patterns: + meta_matcher = re.compile('[()[\].+*?\\^${}]') + pattern['parser'] = getattr(self, '_Dispatcher__' + pattern['name']) + pattern['cregexp'] = re.compile(pattern['regexp']) + if pattern['name'] in self.interests: + match = meta_matcher.search(pattern['regexp']) + if not match: + length = len(pattern['regexp']) + else: + length = match.start() + if length < self.prefix_length: + self.prefix_length = length + if length == 0: + print('Warning: parse table has only 1 bucket because ' + 'of the following pattern:\n %s' % ( + pattern['regexp']), file = sys.stderr) + + # Pass 2: fill in self.parse_table + for pattern in self.patterns: + if pattern['name'] in self.interests: + prefix = pattern['regexp'][0:self.prefix_length] + self.parse_table[prefix].append(pattern) + + # Each entry in this list represents one pattern that can be matched + # against the lines of timetrace files. Each pattern is a dictionary + # containing the following elements: + # name: Name for this pattern. Used for auto-configuration (e.g. + # methods named tt_ are invoked to handle matching + # lines). + # regexp: Regular expression to match against the message portion + # of timetrace records (everything after the core number). + # For efficient matching, there should be several literal + # characters before any regexp metachars. + # cregexp: Compiled version of regexp. + # matches: Number of timetrace lines that matched this pattern. + # parser: Method in this class that will be invoked to do additional + # parsing of matched lines and invoke interests. + # This object is initialized as the parser methods are defined below. + patterns = [] + + # The declarations below define parser methods and their associated + # patterns. The name of a parser is derived from the name of its + # pattern. Parser methods are invoked when lines match the corresponding + # pattern. The job of each method is to parse the matches from the pattern, + # if any, and invoke all of the relevant interests. 
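+    # As an illustrative sketch (hypothetical event name, not a real
+    # pattern), each parser/pattern pair below has the following shape:
+    #
+    #   def __my_event(self, trace, time, core, match, interests):
+    #       id = int(match.group(1))
+    #       for interest in interests:
+    #           interest.tt_my_event(trace, time, core, id)
+    #
+    #   patterns.append({
+    #       'name': 'my_event',
+    #       'regexp': 'my event occurred for id ([0-9]+)'
+    #   })
+    #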
All of the methods + # have the same parameters: + # self: The Dispatcher object + # trace: Holds information being collected from the current trace file + # time: Time of the current record (microseconds) + # core: Number of the core on which the event occurred + # match: The match object returned by re.match + # interests: The list of objects to notify for this event + + def __gro_data(self, trace, time, core, match, interests): + peer = match.group(1) + id = int(match.group(2)) + offset = int(match.group(3)) + prio = int(match.group(4)) + for interest in interests: + interest.tt_gro_data(trace, time, core, peer, id, offset, prio) + + patterns.append({ + 'name': 'gro_data', + 'regexp': 'homa_gro_receive got packet from ([^ ]+) id ([0-9]+), ' + 'offset ([0-9.]+), priority ([0-9.]+)' + }) + + def __gro_grant(self, trace, time, core, match, interests): + peer = match.group(1) + id = int(match.group(2)) + offset = int(match.group(3)) + priority = int(match.group(4)) + for interest in interests: + interest.tt_gro_grant(trace, time, core, peer, id, offset, priority) + + patterns.append({ + 'name': 'gro_grant', + 'regexp': 'homa_gro_receive got grant from ([^ ]+) id ([0-9]+), ' + 'offset ([0-9]+), priority ([0-9]+)' + }) + + def __softirq_data(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + msg_length = int(match.group(3)) + for interest in interests: + interest.tt_softirq_data(trace, time, core, id, offset, msg_length) + + patterns.append({ + 'name': 'softirq_data', + 'regexp': 'incoming data packet, id ([0-9]+), .*, offset ([0-9.]+)' + '/([0-9.]+)' + }) + + def __softirq_grant(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + priority = int(match.group(3)) + increment = int(match.group(4)) + for interest in interests: + interest.tt_softirq_grant(trace, time, core, id, offset, priority, + increment) + + patterns.append({ + 'name': 'softirq_grant', + 'regexp': 'processing grant for id ([0-9]+), offset ([0-9]+), ' + 'priority ([0-9]+), increment ([0-9]+)' + }) + + def __ip_xmit(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + for interest in interests: + interest.tt_ip_xmit(trace, time, core, id, offset) + + patterns.append({ + 'name': 'ip_xmit', + 'regexp': 'calling ip.*_xmit: .* id ([0-9]+), offset ([0-9]+)' + }) + + def __send_data(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + length = int(match.group(3)) + for interest in interests: + interest.tt_send_data(trace, time, core, id, offset, length) + + patterns.append({ + 'name': 'send_data', + 'regexp': 'Finished queueing packet: rpc id ([0-9]+), offset ' + '([0-9]+), len ([0-9]+)' + }) + + def __send_grant(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + priority = int(match.group(3)) + increment = int(match.group(4)) + for interest in interests: + interest.tt_send_grant(trace, time, core, id, offset, priority, + increment) + + patterns.append({ + 'name': 'send_grant', + 'regexp': 'sending grant for id ([0-9]+), offset ([0-9]+), ' + 'priority ([0-9]+), increment ([0-9]+)' + }) + + def __qdisc_queue_data(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + qid = int(match.group(3)) + queue = match.group(4) + for interest in interests: + interest.tt_qdisc_queue_data(trace, time, core, id, + offset, qid, queue) + + 
patterns.append({ + 'name': 'qdisc_queue_data', + 'regexp': '__dev_xmit_skb queueing homa data packet for ' + 'id ([0-9]+), offset ([0-9]+), qid ([0-9]+) \(([^)]+)\)' + }) + + def __nic_data(self, trace, time, core, match, interests): + peer = match.group(2) + id = int(match.group(3)) + offset = int(match.group(4)) + tx_queue = match.group(5) + for interest in interests: + interest.tt_nic_data(trace, time, core, peer, id, offset, tx_queue) + + patterns.append({ + 'name': 'nic_data', + 'regexp': 'sent homa data packet via (mlx|ice) to ([^,]+), id ([0-9]+), ' + 'offset ([0-9]+), queue (0x[0-9a-f]+)' + }) + + def __nic_grant(self, trace, time, core, match, interests): + peer = match.group(2) + id = int(match.group(3)) + offset = int(match.group(4)) + tx_queue = match.group(5) + for interest in interests: + interest.tt_nic_grant(trace, time, core, peer, id, offset, tx_queue) + + patterns.append({ + 'name': 'nic_grant', + 'regexp': 'sent homa grant via (mlx|ice) to ([^,]+), id ([0-9]+), ' + 'offset ([0-9]+), queue (0x[0-9a-f]+)' + }) + + def __free_tx_skb(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + qid = int(match.group(3)) + msg_length = int(match.group(4)) + for interest in interests: + interest.tt_free_tx_skb(trace, time, core, id, offset, qid, + msg_length) + + patterns.append({ + 'name': 'free_tx_skb', + 'regexp': 'freeing tx skb for homa data, id ([0-9]+), ' + 'offset ([0-9]+), qid ([0-9]+), msg_length ([0-9]+)' + }) + + def __free_grant(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + qid = int(match.group(3)) + for interest in interests: + interest.tt_free_grant(trace, time, core, id, offset, qid) + + patterns.append({ + 'name': 'free_grant', + 'regexp': 'freeing tx skb for homa grant, id ([0-9]+), ' + 'offset ([0-9]+), qid ([0-9]+)' + }) + + def __sendmsg_request(self, trace, time, core, match, interests): + peer = match.group(1) + id = int(match.group(2)) + length = int(match.group(3)) + for interest in interests: + interest.tt_sendmsg_request(trace, time, core, peer, id, length) + + patterns.append({ + 'name': 'sendmsg_request', + 'regexp': 'homa_sendmsg request, target ([^: ]+):.* id ' + '([0-9]+), length ([0-9]+)' + }) + + def __sendmsg_response(self, trace, time, core, match, interests): + id = int(match.group(1)) + length = int(match.group(2)) + for interest in interests: + interest.tt_sendmsg_response(trace, time, core, id, length) + + patterns.append({ + 'name': 'sendmsg_response', + 'regexp': 'homa_sendmsg response, id ([0-9]+), .*length ([0-9]+)' + }) + + def __sendmsg_done(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_sendmsg_done(trace, time, core, id) + + patterns.append({ + 'name': 'sendmsg_done', + 'regexp': 'homa_sendmsg finished, id ([0-9]+)' + }) + + def __recvmsg_done(self, trace, time, core, match, interests): + status = int(match.group(1)) + id = int(match.group(2)) + for interest in interests: + interest.tt_recvmsg_done(trace, time, core, id, status) + + patterns.append({ + 'name': 'recvmsg_done', + 'regexp': 'homa_recvmsg returning status ([0-9]+), id ([0-9]+)' + }) + + def __copy_in_start(self, trace, time, core, match, interests): + for interest in interests: + interest.tt_copy_in_start(trace, time, core) + + patterns.append({ + 'name': 'copy_in_start', + 'regexp': 'starting copy from user space' + }) + + def __copy_in_done(self, trace, time, core, match, interests): + id = 
int(match.group(1))
+        num_bytes = int(match.group(2))
+        for interest in interests:
+            interest.tt_copy_in_done(trace, time, core, id, num_bytes)
+
+    patterns.append({
+        'name': 'copy_in_done',
+        'regexp': 'finished copy from user space for id ([-0-9.]+), '
+                'length ([-0-9.]+)'
+    })
+
+    def __copy_out_start(self, trace, time, core, match, interests):
+        id = int(match.group(1))
+        for interest in interests:
+            interest.tt_copy_out_start(trace, time, core, id)
+
+    patterns.append({
+        'name': 'copy_out_start',
+        'regexp': 'starting copy to user space for id ([0-9]+)'
+    })
+
+    def __copy_out_done(self, trace, time, core, match, interests):
+        start = int(match.group(1))
+        end = int(match.group(2))
+        id = int(match.group(3))
+        for interest in interests:
+            interest.tt_copy_out_done(trace, time, core, id, start, end)
+
+    patterns.append({
+        'name': 'copy_out_done',
+        'regexp': 'copied out bytes ([0-9.]+)-([0-9.]+) for id ([0-9.]+)'
+    })
+
+    def __free_skbs(self, trace, time, core, match, interests):
+        num_skbs = int(match.group(1))
+        for interest in interests:
+            interest.tt_free_skbs(trace, time, core, num_skbs)
+
+    patterns.append({
+        'name': 'free_skbs',
+        'regexp': 'finished freeing ([0-9]+) skbs'
+    })
+
+    def __gro_handoff(self, trace, time, core, match, interests):
+        softirq_core = int(match.group(1))
+        for interest in interests:
+            interest.tt_gro_handoff(trace, time, core, softirq_core)
+
+    patterns.append({
+        'name': 'gro_handoff',
+        'regexp': 'homa_gro_.* chose core ([0-9]+)'
+    })
+
+    def __softirq_start(self, trace, time, core, match, interests):
+        for interest in interests:
+            interest.tt_softirq_start(trace, time, core)
+
+    patterns.append({
+        'name': 'softirq_start',
+        'regexp': 'homa_softirq starting'
+    })
+
+    def __rpc_handoff(self, trace, time, core, match, interests):
+        id = int(match.group(1))
+        for interest in interests:
+            interest.tt_rpc_handoff(trace, time, core, id)
+
+    patterns.append({
+        'name': 'rpc_handoff',
+        'regexp': 'homa_rpc_handoff handing off id ([0-9]+)'
+    })
+
+    def __rpc_queued(self, trace, time, core, match, interests):
+        id = int(match.group(1))
+        for interest in interests:
+            interest.tt_rpc_queued(trace, time, core, id)
+
+    patterns.append({
+        'name': 'rpc_queued',
+        'regexp': 'homa_rpc_handoff queued id ([0-9]+)'
+    })
+
+    def __wait_found_rpc(self, trace, time, core, match, interests):
+        id = int(match.group(1))
+        type = match.group(2)
+        blocked = int(match.group(3))
+        for interest in interests:
+            interest.tt_wait_found_rpc(trace, time, core, id, type, blocked)
+
+    patterns.append({
+        'name': 'wait_found_rpc',
+        'regexp': 'homa_wait_[^ ]+ found rpc id ([0-9]+).* via ([a-z_]+), blocked ([0-9]+)'
+    })
+
+    def __poll_success(self, trace, time, core, match, interests):
+        id = int(match.group(1))
+        for interest in interests:
+            interest.tt_poll_success(trace, time, core, id)
+
+    patterns.append({
+        'name': 'poll_success',
+        'regexp': 'received RPC handoff while polling, id ([0-9]+)'
+    })
+
+    def __resend_tx(self, trace, time, core, match, interests):
+        id = int(match.group(1))
+        peer = match.group(2)
+        offset = int(match.group(3))
+        length = int(match.group(4))
+        for interest in interests:
+            interest.tt_resend_tx(trace, time, core, id, peer, offset, length)
+
+    patterns.append({
+        'name': 'resend_tx',
+        'regexp': 'Sending RESEND for id ([0-9]+), peer ([^,]+), '
+                'offset ([0-9]+), length ([-0-9]+)'
+    })
+
+    def __resend_rx(self, trace, time, core, match, interests):
+        id = int(match.group(1))
+        offset = int(match.group(2))
+        length = int(match.group(3))
+        for interest
in interests: + interest.tt_resend_rx(trace, time, core, id, offset, length) + + patterns.append({ + 'name': 'resend_rx', + 'regexp': 'resend request for id ([0-9]+), offset ([0-9]+), ' + 'length ([-0-9]+)' + }) + + def __retransmit(self, trace, time, core, match, interests): + offset = int(match.group(1)) + length = int(match.group(2)) + id = int(match.group(3)) + for interest in interests: + interest.tt_retransmit(trace, time, core, id, offset, length) + + patterns.append({ + 'name': 'retransmit', + 'regexp': 'retransmitting offset ([0-9]+), length ([0-9]+), id ([0-9]+)' + }) + + def __unsched(self, trace, time, core, match, interests): + id = int(match.group(1)) + num_bytes = int(match.group(2)) + for interest in interests: + interest.tt_unsched(trace, time, core, id, num_bytes) + + patterns.append({ + 'name': 'unsched', + 'regexp': 'Incoming message for id ([0-9]+) has ([0-9]+) unscheduled' + }) + + def __lock_wait(self, trace, time, core, match, interests): + event = match.group(1) + lock_name = match.group(2) + for interest in interests: + interest.tt_lock_wait(trace, time, core, event, lock_name) + + patterns.append({ + 'name': 'lock_wait', + 'regexp': '(beginning|ending) wait for (.*) lock' + }) + + def __resend_busy(self, trace, time, core, match, interests): + id = int(match.group(1)) + state = int(match.group(2)) + for interest in interests: + interest.tt_resend_busy(trace, time, core, id, state) + + patterns.append({ + 'name': 'resend_busy', + 'regexp': 'sending BUSY from resend, id ([0-9]+), state ([0-9]+)' + }) + + def __softirq_resend(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + length = int(match.group(3)) + prio = int(match.group(4)) + for interest in interests: + interest.tt_softirq_resend(trace, time, core, id, offset, + length, prio) + + patterns.append({ + 'name': 'softirq_resend', + 'regexp': 'resend request for id ([0-9]+), offset ([0-9]+), ' + 'length ([0-9]+), prio ([0-9]+)' + }) + + def __rpc_end(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_rpc_end(trace, time, core, id) + + patterns.append({ + 'name': 'rpc_end', + 'regexp': 'homa_rpc_end invoked for id ([0-9]+)' + }) + + def __grant_check_start(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_grant_check_start(trace, time, core, id) + + patterns.append({ + 'name': 'grant_check_start', + 'regexp': 'homa_grant_check_rpc starting for id ([0-9]+)' + }) + + def __grant_check_done(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_grant_check_done(trace, time, core, id) + + patterns.append({ + 'name': 'grant_check_done', + 'regexp': 'homa_grant_check_rpc finished with id ([0-9]+)' + }) + + def __grant_check_lock_recalc(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_grant_check_lock_recalc(trace, time, core, id) + + patterns.append({ + 'name': 'grant_check_lock_recalc', + 'regexp': 'homa_grant_check_rpc acquiring grant lock to fix order \(id ([0-9]+)\)' + }) + + def __grant_check_lock_needy(self, trace, time, core, match, interests): + rank = int(match.group(1)) + id = int(match.group(2)) + active = int(match.group(3)) + for interest in interests: + interest.tt_grant_check_lock_needy(trace, time, core, id, rank, active) + + patterns.append({ + 'name': 'grant_check_lock_needy', + 'regexp': 
'homa_grant_check_rpc acquiring grant lock, needy_rank ' + '([0-9]+), id ([0-9]+), num_active ([0-9]+)' + }) + + def __grant_check_unlock(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_grant_check_unlock(trace, time, core, id) + + patterns.append({ + 'name': 'grant_check_unlock', + 'regexp': 'homa_grant_check_rpc released grant lock \(id ([0-9]+)\)' + }) + + def __rpc_incoming(self, trace, time, core, match, interests): + id = int(match.group(1)) + peer = match.group(2) + received = int(match.group(3)) + length = int(match.group(4)) + for interest in interests: + interest.tt_rpc_incoming(trace, time, core, id, peer, received, length) + + patterns.append({ + 'name': 'rpc_incoming', + 'regexp': 'Incoming RPC id ([0-9]+), peer ([^,]+), ([0-9]+)/([0-9]+) bytes' + }) + + def __rpc_incoming2(self, trace, time, core, match, interests): + id = int(match.group(1)) + incoming = int(match.group(2)) + granted = int(match.group(3)) + for interest in interests: + interest.tt_rpc_incoming2(trace, time, core, id, incoming, granted) + + patterns.append({ + 'name': 'rpc_incoming2', + 'regexp': 'RPC id ([0-9]+) has incoming ([-0-9]+), granted ([0-9]+)' + }) + + def __rpc_incoming3(self, trace, time, core, match, interests): + id = int(match.group(1)) + length = int(match.group(2)) + remaining = int(match.group(3)) + rank = int(match.group(4)) + for interest in interests: + interest.tt_rpc_incoming3(trace, time, core, id, length, + remaining, rank) + + patterns.append({ + 'name': 'rpc_incoming3', + 'regexp': 'RPC id ([0-9]+): length ([0-9]+), remaining ([0-9]+), ' + 'rank ([-0-9]+)' + }) + + def __bpages_alloced(self, trace, time, core, match, interests): + id = int(match.group(1)) + bpages = int(match.group(2)) + for interest in interests: + interest.tt_bpages_alloced(trace, time, core, id, bpages) + + patterns.append({ + 'name': 'bpages_alloced', + 'regexp': 'RPC id ([0-9]+) has ([0-9]+) bpages allocated' + }) + + def __rpc_outgoing(self, trace, time, core, match, interests): + id = int(match.group(1)) + peer = match.group(2) + sent = int(match.group(3)) + length = int(match.group(4)) + for interest in interests: + interest.tt_rpc_outgoing(trace, time, core, id, peer, sent, length) + + patterns.append({ + 'name': 'rpc_outgoing', + 'regexp': 'Outgoing RPC id ([0-9]+), peer ([^,]+), ' + '([0-9]+)/([0-9]+) bytes' + }) + + def __discard_unknown(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_discard_unknown(trace, time, core, id) + + patterns.append({ + 'name': 'discard_unknown', + 'regexp': 'Discarding packet for unknown RPC, id ([0-9]+),' + }) + + def __pacer_xmit(self, trace, time, core, match, interests): + id = int(match.group(1)) + port = int(match.group(2)) + offset = int(match.group(3)) + bytes_left = int(match.group(4)) + for interest in interests: + interest.tt_pacer_xmit(trace, time, core, id, offset, port, + bytes_left) + + patterns.append({ + 'name': 'pacer_xmit', + 'regexp': 'pacer calling homa_xmit_data for rpc id ([0-9]+), port ' + '([0-9]+), offset ([0-9]+), bytes_left ([0-9]+)' + }) + + def __qdisc_defer(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + for interest in interests: + interest.tt_qdisc_defer(trace, time, core, id, offset) + + patterns.append({ + 'name': 'qdisc_defer', + 'regexp': 'homa_qdisc_enqueue deferring homa data packet for ' + 'id ([0-9]+), offset ([0-9]+)' + }) + + def __qdisc_xmit(self, 
trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + for interest in interests: + interest.tt_qdisc_xmit(trace, time, core, id, offset) + + patterns.append({ + 'name': 'qdisc_xmit', + 'regexp': 'homa_qdisc_pacer queuing homa data packet for id ([0-9]+), ' + 'offset ([-0-9]+)' + }) + + def __snapshot_clock(self, trace, time, core, match, interests): + usecs = int(match.group(1)) + for interest in interests: + interest.tt_snapshot_clock(trace, time, core, usecs) + + patterns.append({ + 'name': 'snapshot_clock', + 'regexp': 'rpc snapshot usecs ([0-9-]+)' + }) + + + def __snapshot_client_request(self, trace, time, core, match, interests): + msgs_started = int(match.group(1)) + bytes_started = int(match.group(2)) + bytes_done = int(match.group(3)) + msgs_done = int(match.group(4)) + for interest in interests: + interest.tt_snapshot_client_request(trace, time, core, msgs_started, + bytes_started, bytes_done, msgs_done) + + patterns.append({ + 'name': 'snapshot_client_request', + 'regexp': 'rpc snapshot client requests started ([0-9]+), ' + 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)' + }) + + def __snapshot_client_response(self, trace, time, core, match, interests): + msgs_started = int(match.group(1)) + bytes_started = int(match.group(2)) + bytes_done = int(match.group(3)) + msgs_done = int(match.group(4)) + for interest in interests: + interest.tt_snapshot_client_response(trace, time, core, msgs_started, + bytes_started, bytes_done, msgs_done) + + patterns.append({ + 'name': 'snapshot_client_response', + 'regexp': 'rpc snapshot client responses started ([0-9]+), ' + 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)' + }) + + def __snapshot_server_request(self, trace, time, core, match, interests): + msgs_started = int(match.group(1)) + bytes_started = int(match.group(2)) + bytes_done = int(match.group(3)) + msgs_done = int(match.group(4)) + for interest in interests: + interest.tt_snapshot_server_request(trace, time, core, msgs_started, + bytes_started, bytes_done, msgs_done) + + patterns.append({ + 'name': 'snapshot_server_request', + 'regexp': 'rpc snapshot server requests started ([0-9]+), ' + 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)' + }) + + def __snapshot_server_response(self, trace, time, core, match, interests): + msgs_started = int(match.group(1)) + bytes_started = int(match.group(2)) + bytes_done = int(match.group(3)) + msgs_done = int(match.group(4)) + for interest in interests: + interest.tt_snapshot_server_response(trace, time, core, msgs_started, + bytes_started, bytes_done, msgs_done) + + patterns.append({ + 'name': 'snapshot_server_response', + 'regexp': 'rpc snapshot server responses started ([0-9]+), ' + 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)' + }) + + def __tcp_sendmsg2(self, trace, time, core, match, interests): + saved = self.core_saved[core] + saved['sendmsg_slot'] = int(match.group(1)) + saved['sendmsg_response'] = int(match.group(2)) + + patterns.append({ + 'name': 'tcp_sendmsg2', + 'regexp': 'tcp_sendmsg new message slot is ([0-9]+), response ([0-9]+)' + }) + + def __tcp_sendmsg(self, trace, time, core, match, interests): + saved = self.core_saved[core] + if not 'sendmsg_slot' in saved: + return + source = match.group(1) + dest = match.group(2) + msg_length = int(match.group(3)) + sequence = int(match.group(4)) + for interest in interests: + interest.tt_tcp_sendmsg(trace, time, core, source, dest, msg_length, + sequence, saved['sendmsg_slot'], 
saved['sendmsg_response']) + + patterns.append({ + 'name': 'tcp_sendmsg', + 'regexp': 'tcp_sendmsg invoked for message from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), length ([0-9]+), starting sequence ([0-9]+)' + }) + + def __tcp_xmit(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) + for interest in interests: + interest.tt_tcp_xmit(trace, time, core, source, dest, data_bytes, + seq_ack) + + patterns.append({ + 'name': 'tcp_xmit', + 'regexp': 'Transmitting TCP packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' + }) + + def __tcp_qdisc(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) + for interest in interests: + interest.tt_tcp_qdisc(trace, time, core, source, dest, data_bytes, + seq_ack) + + patterns.append({ + 'name': 'tcp_qdisc', + 'regexp': 'homa_qdisc_pacer requeued TCP packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' + }) + + def __tcp_nic(self, trace, time, core, match, interests): + source = match.group(2) + dest = match.group(3) + data_bytes = int(match.group(4)) + seq_ack = int(match.group(5)) + for interest in interests: + interest.tt_tcp_nic(trace, time, core, source, dest, data_bytes, + seq_ack) + + patterns.append({ + 'name': 'tcp_nic', + 'regexp': 'sent TCP packet via (mlx|ice) from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' + }) + + def __tcp_free2(self, trace, time, core, match, interests): + self.core_saved[core]['tcp_free_qid'] = int(match.group(1)) + + patterns.append({ + 'name': 'tcp_free2', + 'regexp': 'freeing TCP skb for qid ([0-9]+)' + }) + + def __tcp_free(self, trace, time, core, match, interests): + saved = self.core_saved[core] + if not 'tcp_free_qid' in saved: + return + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) + for interest in interests: + interest.tt_tcp_free(trace, time, core, source, dest, data_bytes, + seq_ack, saved['tcp_free_qid']) + + patterns.append({ + 'name': 'tcp_free', + 'regexp': 'freeing TCP skb from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' + }) + + def __tcp_gro(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) + for interest in interests: + interest.tt_tcp_gro(trace, time, core, source, dest, data_bytes, + seq_ack) + patterns.append({ + 'name': 'tcp_gro', + 'regexp': 'tcp_gro_receive got packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' + }) + + def __tcp_softirq(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) + for interest in interests: + interest.tt_tcp_softirq(trace, time, core, source, dest, data_bytes, + seq_ack) + patterns.append({ + 'name': 'tcp_softirq', + 'regexp': 'softirq got TCP packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' + }) + + def __tcp_recvmsg(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + msg_length = int(match.group(3)) + sequence = int(match.group(4)) + for interest in interests: + interest.tt_tcp_recvmsg(trace, time, core, source, dest, 
msg_length,
+                sequence)
+
+    patterns.append({
+        'name': 'tcp_recvmsg',
+        'regexp': 'tcp_recvmsg returning message from (0x[a-f0-9]+) to '
+                '(0x[a-f0-9]+), length ([0-9]+), ending sequence ([0-9]+)'
+    })
+
+    def __txq_stop(self, trace, time, core, match, interests):
+        queue = match.group(1)
+        limit = int(match.group(2))
+        queued = int(match.group(3))
+        for interest in interests:
+            interest.tt_txq_stop(trace, time, core, queue, limit, queued)
+
+    patterns.append({
+        'name': 'txq_stop',
+        'regexp': r'netdev_tx_sent_queue stopped queue (0x[a-f0-9]+): limit '
+                '([0-9]+), queued ([0-9]+)'
+    })
+
+    def __txq_restart(self, trace, time, core, match, interests):
+        queue = match.group(1)
+        for interest in interests:
+            interest.tt_txq_restart(trace, time, core, queue)
+
+    patterns.append({
+        'name': 'txq_restart',
+        'regexp': r'netdev_tx_completed_queue restarted queue (0x[a-f0-9]+)'
+    })
+
+#------------------------------------------------
+# Analyzer: activity
+#------------------------------------------------
+class AnalyzeActivity:
+    """
+    Prints statistics about how many RPCs are live and data throughput.
+    If --data is specified, generates activity_<node>.dat files that
+    describe activity over small intervals across the traces. The
+    information in the .dat files includes new and total incoming
+    messages, messages with grants, KB of outstanding grants, incoming
+    data, and new and total outgoing messages.
+    """
+
+    def __init__(self, dispatcher):
+        dispatcher.interest('AnalyzeRpcs')
+        dispatcher.interest('AnalyzePackets')
+        dispatcher.interest('AnalyzeGrants')
+        dispatcher.interest('AnalyzeIntervals')
+
+    def analyze(self):
+        global rpcs, packets, traces
+
+        # Each of the following lists contains <time, event> entries,
+        # where event is 'start' or 'end'. The entry indicates that an
+        # input or output message started arriving or completed at the
+        # given time.
+
+        # Node name -> list of events for input messages on that server.
+        self.node_in_msgs = {}
+
+        # Node name -> count of new incoming messages that started during
+        # the trace.
+        self.node_in_starts = defaultdict(lambda: 0)
+
+        # Node name -> list of events for output messages on that server.
+        self.node_out_msgs = {}
+
+        # Node name -> count of new outgoing messages that started during
+        # the trace.
+        self.node_out_starts = defaultdict(lambda: 0)
+
+        # Node name -> dictionary that maps from core number to total GRO
+        # data received by that core
+        self.node_core_in_bytes = {}
+
+        # Node name -> total bytes output by that node
+        self.node_out_bytes = {}
+
+        # Node name -> dictionary that maps from core number to total RPCs
+        # that used the given core for GRO.
+        self.node_core_rpcs = {}
+
+        # Node name -> dictionary that maps from core number to total RPCs
+        # still in incoming state at the end of the trace that use the
+        # given core for GRO.
+ self.node_core_pending = {} + + for node in get_sorted_nodes(): + self.node_in_msgs[node] = [] + self.node_out_msgs[node] = [] + self.node_core_in_bytes[node] = {} + self.node_out_bytes[node] = 0 + self.node_core_rpcs[node] = defaultdict(lambda: 0) + self.node_core_pending[node] = defaultdict(lambda: 0) + + # Scan RPCs to collect data + for id, rpc in rpcs.items(): + node = rpc['node'] + + if 'gro_core' in rpc: + core = rpc['gro_core'] + self.node_core_rpcs[node][core] += 1 + if 'remaining' in rpc: + self.node_core_pending[node][core] += 1 + + if 'rx_live' in rpc: + in_start, in_end = rpc['rx_live'] + self.node_in_msgs[node].append([in_start, 'start']) + self.node_in_msgs[node].append([in_end, 'end']) + + for time, offset, priority in rpc['gro_data']: + if offset == 0: + self.node_in_starts[node] += 1 + break + + if 'tx_live' in rpc: + out_start, out_end = rpc['tx_live'] + self.node_out_msgs[node].append([out_start, 'start']) + self.node_out_msgs[node].append([out_end, 'end']) + + if 'sendmsg' in rpc: + self.node_out_starts[node] += 1 + + sender_id = id^1 + if sender_id in rpcs: + sender = rpcs[sender_id] + else: + sender = None + for time, offset, prio in rpc['gro_data']: + xmit = None + if sender != None: + xmit = get_xmit_time(offset, sender) + if (xmit == None) or (xmit > time): + xmit = time + length = get_recv_length(offset, rpc['in_length']) + if xmit > time: + print('\nNegative transmit time for offset %d in id %d: %s' % + (offset, id, rpc)) + print('\nSending RPC: %s' % (sender)) + + cores = self.node_core_in_bytes[node] + core = rpc['gro_core'] + if not core in cores: + cores[core] = length + else: + cores[core] += length + + for time, offset, length in rpc['send_data']: + self.node_out_bytes[node] += length + + def print_rates(self): + """ + Print summary information about packet and data rates for both Homa + and TCP. 
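+        Rates are normalized by each node's trace duration (usecs). As an
+        illustrative calculation: a node that sent 125e6 bytes of Homa
+        message data over a 10,000 usec trace is reported as
+        125e6 * 8e-3 / 10000 = 100 Gbps.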
+ """ + global packets, grants, tcp_packets + + # node -> dictionary of information about that node: + # homa_pkts: Total Homa data packets sent by the node + # homa_bytes: Total Homa message bytes sent by the node + # homa_grants: Homa grants sent by the node + # tcp_pkts: TCP packets with data sent by the node + # tcp_bytes: Total data sent in TCP packets + # tcp_acks: TCP packets with no data + nodes = defaultdict(lambda : defaultdict(lambda: 0)) + + for pkt in packets.values(): + if not pkt['tx_node'] or not 'tso_length' in pkt: + continue + node_stats = nodes[pkt['tx_node']] + node_stats['homa_pkts'] += 1 + node_stats['homa_bytes'] += pkt['tso_length'] + + for pkt in grants.values(): + if not pkt['tx_node']: + continue + node_stats = nodes[pkt['tx_node']] + node_stats['homa_grants'] += 1 + + for pkt in tcp_packets.values(): + if pkt['tx_node']: + continue + node_stats = nodes[pkt['tx_node']] + if not 'tso_length' in pkt: + continue + length = pkt['tso_length'] + if length > 0: + node_stats['tcp_pkts'] += 1 + node_stats['tcp_bytes'] += length + else: + node_stats['tcp_acks'] += 1 + + print('Summary of packet and data rates for Homa and TCP for each node:') + print('HomaGbps: Rate of outgoing Homa message data from node ' + '(Gbps)') + print('HomaPkts: Rate of outgoing Homa data packets from node ' + '(K pkts/sec)') + print('HomaGrants: Rate of outgoing Homa grant packets from node ' + '(K pkts/sec)') + print('TcpGbps: Rate of outgoing TCP data from node (Gbps)') + print('TcpPkts: Rate of outgoing TCP packets with data from node ' + '(K pkts/sec)') + print('TcpAcks: Rate of outgoing TCP ack packets from node ' + '(K pkts/sec)') + print() + print('Node HomaGbps HomaPkts HomaGrants TcpGbps TcpPkts TcpAcks') + print('-----------------------------------------------------------------') + for node in get_sorted_nodes(): + node_stats = nodes[node] + usecs = traces[node]['last_time'] - traces[node]['first_time'] + print('%-10s %7.2f %7.1f %7.1f %7.2f %7.1f %7.1f' % ( + node, node_stats['homa_bytes'] * 8e-3 / usecs, + node_stats['homa_pkts'] * 1e3 / usecs, + node_stats['homa_grants'] * 1e3 / usecs, + node_stats['tcp_bytes'] * 8e-3 / usecs, + node_stats['tcp_pkts'] * 1e3 / usecs, + node_stats['tcp_acks'] * 1e3 / usecs + )) + + def sum_list(self, events): + """ + Given a list of entries where event is 'start' or 'end', + return a list : + num_starts: Total number of 'start' events + live_frac: Fraction of all time when #starts > #ends + avg_live: Average value of #starts - #ends + The input list should be sorted in order of time by the caller. 
+ """ + num_starts = 0 + cur_live = 0 + live_time = 0 + live_integral = 0 + + if not events: + return [0, 0, 0] + last_time = events[0][0] + + for time, event in events: + # print("%9.3f: %s, cur_live %d, live_time %.1f, live_integral %.1f" % + # (time, event, cur_live, live_time, live_integral)) + delta = time - last_time + if cur_live: + live_time += delta + live_integral += delta * cur_live + if event == 'start': + num_starts += 1 + cur_live += 1 + else: + cur_live -= 1 + last_time = time + total_time = events[-1][0] - events[0][0] + return num_starts, live_time/total_time, live_integral/total_time + + def output(self): + global rpcs, traces + + print('\n-------------------') + print('Analyzer: activity') + print('-------------------\n') + print('Msgs: Total number of incoming/outgoing messages that were') + print(' live at some point during the traces') + print('MsgRate: Rate at which new messages were initiated (K/sec)') + print('LiveFrac: Fraction of time when at least one message was live') + print('AvgLive: Average number of live messages') + print('Gbps: Total message throughput (Gbps)') + print('LiveGbps: Total throughput when at least one message was live (Gbps)') + print('MaxCore: Highest incoming throughput via a single GRO core (Gbps)') + print('MaxFrac: Highest fraction of all RPCs serviced by a ' + 'single GRO core') + print('MaxPFrac: Highest fraction of partially received RPCs (at ' + 'end of trace)') + print(' handled by a single GRO core\n') + print('Incoming messages:') + print('Node Msgs MsgRate LiveFrac AvgLive Gbps LiveGbps' + ' MaxCore MaxFrac MaxPFrac') + print('-------------------------------------------------------------' + '--------------------------------------') + for node in get_sorted_nodes(): + if not node in self.node_in_msgs: + continue + events = sorted(self.node_in_msgs[node], key=lambda t : t[0]) + max_core = 0 + max_bytes = 0 + total_bytes = 0 + for core, bytes in self.node_core_in_bytes[node].items(): + total_bytes += bytes + if bytes > max_bytes: + max_bytes = bytes + max_core = core + max_gbps = max_bytes*8e-3/(traces[node]['elapsed_time']) + max_pending = -1 + max_pending_core = -1 + total_pending = 0 + for core, pending in self.node_core_pending[node].items(): + total_pending += pending + if pending > max_pending: + max_pending = pending + max_pending_core = core + max_rpcs = -1 + max_rpcs_core = -1 + total_rpcs = 0 + for core, rpcs in self.node_core_rpcs[node].items(): + total_rpcs += rpcs + if rpcs > max_rpcs: + max_rpcs = rpcs + max_rpcs_core = core + + elapsed = traces[node]['elapsed_time'] + msgs, liveFrac, avgLive = self.sum_list(events) + rate = 1e3 * self.node_in_starts[node] / elapsed + avg_gbps = total_bytes*8e-3 / elapsed + print('%-10s %6d %7.2f %9.3f %8.2f %7.2f %7.2f' % ( + node, msgs, rate, liveFrac, avgLive, avg_gbps, + avg_gbps/liveFrac if liveFrac != 0 else 0, + ), end='') + print(' %5.2f (C%02d) %6.3f (C%02d) %6.3f (C%02d)' % ( + max_gbps, max_core, div_safe(max_rpcs, total_rpcs), + max_rpcs_core, div_safe(max_pending, total_pending), + max_pending_core)) + print('\nOutgoing messages:') + print('Node Msgs MsgRate LiveFrac AvgLive Gbps LiveGbps') + print('-------------------------------------------------------------') + for node in get_sorted_nodes(): + if not node in self.node_out_msgs: + continue + bytes = self.node_out_bytes[node] + elapsed = traces[node]['elapsed_time'] + events = sorted(self.node_out_msgs[node]) + msgs, liveFrac, avgLive = self.sum_list(events) + rate = 1e3 * self.node_out_starts[node] / elapsed + avg_gbps 
= bytes*8e-3 / elapsed + print('%-10s %6d %7.2f %9.3f %8.2f %7.2f %7.2f' % ( + node, msgs, rate, liveFrac, avgLive, avg_gbps, + avg_gbps/liveFrac if liveFrac != 0 else 0)) + + print() + self.print_rates() + + if options.data: + for node in get_sorted_nodes(): + f = open('%s/activity_%s.dat' % (options.data, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Statistics about RPC and packet activity on the ') + f.write('node over %d usec\n' % (options.interval)) + f.write('# intervals:\n') + f.write('# Time: End of the time interval\n') + f.write('# NewRx: New incoming messages that started during ' + 'the interval\n') + f.write('# NumRx: Incoming messages that were partially ' + 'received at the\n') + f.write('# end of the interval\n') + f.write('# RxGts: Number of incoming RPCS with outstanding grants at the\n') + f.write('# end of the interval (doesn\'t include unscheduled)\n') + f.write('# RxGtKB: Number of KB for which grants have been sent but data\n') + f.write('# not yet received at the end of the interval\n') + f.write('# RxPkts: Number of data packets received during the interval\n') + f.write('# RxGbps: Throughput of received data during the interval\n') + f.write('# Incoming: KB of data that had been transmitted but not yet\n') + f.write('# received, as of the end of the interval\n') + f.write('# NewTx: New outgoing messages that started during ' + 'the interval\n') + f.write('# NumTx: Outgoing messages that were partially ' + 'transmitted at the\n') + f.write('# end of the interval\n') + f.write('\n') + f.write('# Time NewRx NumRx RxGts RxGtKB RxPkts RxGbps Incoming NewTx NumTx\n') + for interval in intervals[node]: + f.write('%8.1f' % (interval['time'])) + if 'rx_starts' in interval: + f.write(' %5d %5d' % (interval['rx_starts'], + interval['rx_live'])) + else: + f.write(' '*12) + if 'rx_grants' in interval: + f.write(' %5d %6.0f' % (interval['rx_grants'], + interval['rx_grant_bytes']/1000.0)) + else: + f.write(' ' *13) + if 'rx_pkts' in interval: + f.write(' %6d %6.1f %6.1f' % (interval['rx_pkts'], + gbps(interval['rx_bytes'], options.interval), + (interval['rx_data_qdisc'] + + interval['rx_data_net'])*1e-3)) + else: + f.write(' '*12) + if 'tx_starts' in interval: + f.write(' %5d %5d' % (interval['tx_starts'], + interval['tx_live_req'] + + interval['tx_live_resp'])) + else: + f.write(' '*12) + f.write('\n') + f.close() + +#------------------------------------------------ +# Analyzer: bpages +#------------------------------------------------ +class AnalyzeBpages: + """ + Output information about bpage usage on each of the nodes, as of the + end of the traces. This information is only available if + homa_rpc_log_active_tt was invoked before freezing the time traces. 
+ """ + def __init__(self, dispatcher): + self.node_rpcs = defaultdict(lambda : 0) + self.node_bpages = defaultdict(lambda: 0) + + def tt_bpages_alloced(self, trace, time, core, id, bpages): + node = trace['node'] + self.node_rpcs[node] += 1 + self.node_bpages[node] += bpages + + def output(self): + global traces, options + print('\n-------------------') + print('Analyzer: bpages') + print('-------------------') + print('Bpage usage at the end of the traces') + print('Node: Name of a node') + print('RPCs: Number of RPCs on that node with bpages allocated') + print('Bpages: Total bpages (or partial bpages) allocated on the node') + print('') + print('Node RPCs Bpages') + for node in get_sorted_nodes(): + print('%-10s %5d %6d' % (node, self.node_rpcs[node], + self.node_bpages[node])) + +#------------------------------------------------ +# Analyzer: copy +#------------------------------------------------ +class AnalyzeCopy: + """ + Measures the throughput of copies between user space and kernel space. + """ + + def __init__(self, dispatcher): + return + + def init_trace(self, trace): + trace['copy'] = { + # Keys are cores; values are times when most recent copy from + # user space started on that core + 'in_start': {}, + + # Total bytes of data copied from user space for large messages + 'large_in_data': 0, + + # Total microseconds spent copying data for large messages + 'large_in_time': 0.0, + + # Total number of large messages copied into kernel + 'large_in_count': 0, + + # List of copy times for messages no larger than 1200 B + 'small_in_times': [], + + # Total time spent copying in data for all messages + 'total_in_time': 0.0, + + # Keys are cores; values are times when most recent copy to + # user space started on that core + 'out_start': {}, + + # Keys are cores; values are times when most recent copy to + # user space ended on that core + 'out_end': {}, + + # Keys are cores; values are sizes of last copy to user space + 'out_size': {}, + + # Total bytes of data copied to user space for large messages + 'large_out_data': 0, + + # Total microseconds spent copying data for large messages + 'large_out_time': 0.0, + + # Total microseconds spent copying data for large messages, + # including time spent freeing skbs. 
+ 'large_out_time_with_skbs': 0.0, + + # Total number of large messages copied out of kernel + 'large_out_count': 0, + + # List of copy times for messages no larger than 1200 B + 'small_out_times': [], + + # Total time spent copying out data for all messages + 'total_out_time': 0.0, + + # Total number of skbs freed after copying data to user space + 'skbs_freed': 0, + + # Total time spent freeing skbs after copying data + 'skb_free_time': 0.0 + } + + def tt_copy_in_start(self, trace, time, core): + stats = trace['copy'] + stats['in_start'][core] = time + + def tt_copy_in_done(self, trace, time, core, id, num_bytes): + global options + stats = trace['copy'] + if core in stats['in_start']: + delta = time - stats['in_start'][core] + stats['total_in_time'] += delta + if num_bytes <= 1000: + stats['small_in_times'].append(delta) + elif num_bytes >= 5000: + stats['large_in_data'] += num_bytes + stats['large_in_time'] += delta + stats['large_in_count'] += 1 + if 0 and options.verbose: + print('%9.3f Copy in finished [C%02d]: %d bytes, %.1f us, %5.1f Gbps' % + (time, core, num_bytes, delta, 8e-03*num_bytes/delta)) + + def tt_copy_out_start(self, trace, time, core, id): + stats = trace['copy'] + stats['out_start'][core] = time + + def tt_copy_out_done(self, trace, time, core, id, start, end): + global options + stats = trace['copy'] + num_bytes = end - start + if core in stats['out_start']: + stats['out_end'][core] = time + stats['out_size'][core] = num_bytes + delta = time - stats['out_start'][core] + stats['out_start'][core] = time + stats['total_out_time'] += delta + if num_bytes <= 1000: + stats['small_out_times'].append(delta) + elif num_bytes >= 5000: + stats['large_out_data'] += num_bytes + stats['large_out_time'] += delta + stats['large_out_time_with_skbs'] += delta + stats['large_out_count'] += 1 + if 0 and options.verbose: + print('%9.3f Copy out finished [C%02d]: %d bytes, %.1f us, %5.1f Gbps' % + (time, core, num_bytes, delta, 8e-03*num_bytes/delta)) + + def tt_free_skbs(self, trace, time, core, num_skbs): + stats = trace['copy'] + if core in stats['out_end']: + delta = time - stats['out_end'][core] + stats['skbs_freed'] += num_skbs + stats['skb_free_time'] += delta + if stats['out_size'][core] >= 5000: + stats['large_out_time_with_skbs'] += delta + + def output(self): + global traces + print('\n---------------') + print('Analyzer: copy') + print('---------------') + print('Performance of data copying between user space and kernel:') + print('Node: Name of node') + print('#Short: Number of short blocks copied (<= 1000 B)') + print('Min: Minimum copy time for a short block (usec)') + print('P50: Median copy time for short blocks (usec)') + print('P90: 90th percentile copy time for short blocks (usec)') + print('P99: 99th percentile copy time for short blocks (usec)') + print('Max: Maximum copy time for a short block (usec)') + print('Avg: Average copy time for short blocks (usec)') + print('#Long: Number of long blocks copied (>= 5000 B)') + print('TputC: Average per-core throughput for copying long blocks') + print(' when actively copying (Gbps)') + print('TputN: Average long block copy throughput for the node (Gbps)') + print('Cores: Average number of cores copying long blocks') + print('') + print('Copying from user space to kernel:') + print('Node #Short Min P50 P90 P99 Max Avg #Long ' + 'TputC TputN Cores') + print('--------------------------------------------------------------' + '-----------------') + for node in get_sorted_nodes(): + trace = traces[node] + stats = trace['copy'] + 
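+            # Percentiles below are taken by direct indexing into the
+            # sorted list; e.g. with 200 short copies, P90 is
+            # sorted_data[90*200//100], i.e. sorted_data[180].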
+ num_short = len(stats['small_in_times']) + if num_short == 0: + min = p50 = p90 = p99 = max = avg = 0.0 + else: + sorted_data = sorted(stats['small_in_times']) + min = sorted_data[0] + p50 = sorted_data[50*num_short//100] + p90 = sorted_data[90*num_short//100] + p99 = sorted_data[99*num_short//100] + max = sorted_data[-1] + avg = sum(sorted_data)/num_short + + num_long = stats['large_in_count'] + if stats['large_in_time'] == 0: + core_tput = ' N/A' + node_tput = ' N/A' + cores = 0 + else: + core_tput = '%6.1f' % (8e-03*stats['large_in_data'] + /stats['large_in_time']) + node_tput = '%6.1f' % (8e-03*stats['large_in_data'] + /trace['elapsed_time']) + cores = stats['total_in_time']/trace['elapsed_time'] + print('%-10s %6d%6.1f%6.1f%6.1f%6.1f%6.1f%6.1f %5d %s%s %5.2f' % + (node, num_short, min, p50, p90, p99, max, avg, num_long, + core_tput, node_tput, cores)) + + print('\nCopying from kernel space to user:') + print('Node #Short Min P50 P90 P99 Max Avg #Long ' + 'TputC TputN Cores') + print('--------------------------------------------------------------' + '-----------------') + for node in get_sorted_nodes(): + trace = traces[node] + stats = trace['copy'] + + num_short = len(stats['small_out_times']) + if num_short == 0: + min = p50 = p90 = p99 = max = avg = 0.0 + else: + sorted_data = sorted(stats['small_out_times']) + min = sorted_data[0] + p50 = sorted_data[50*num_short//100] + p90 = sorted_data[90*num_short//100] + p99 = sorted_data[99*num_short//100] + max = sorted_data[-1] + avg = sum(sorted_data)/num_short + + num_long = stats['large_out_count'] + if stats['large_out_time'] == 0: + core_tput = ' N/A' + node_tput = ' N/A' + cores = 0 + else: + core_tput = '%6.1f' % (8e-03*stats['large_out_data'] + /stats['large_out_time']) + node_tput = '%6.1f' % (8e-03*stats['large_out_data'] + /trace['elapsed_time']) + cores = stats['total_out_time']/trace['elapsed_time'] + print('%-10s %6d%6.1f%6.1f%6.1f%6.1f%6.1f%6.1f %5d %s%s %5.2f' % + (node, num_short, min, p50, p90, p99, max, avg, num_long, + core_tput, node_tput, cores)) + + print('\nImpact of freeing socket buffers while copying to user:') + print('Node: Name of node') + print('#Freed: Number of skbs freed') + print('Time: Average time to free an skb (usec)') + print('Tput: Effective kernel->user throughput per core (TputC) including') + print(' skb freeing (Gbps)') + print('') + print('Node #Freed Time Tput') + print('-------------------------------') + for node in get_sorted_nodes(): + trace = traces[node] + stats = trace['copy'] + stats['skbs_freed'] + if stats['skbs_freed'] == 0: + free_time = 0 + tput = 0 + else: + free_time = stats['skb_free_time']/stats['skbs_freed'] + if stats['large_out_time_with_skbs']: + tput = '%6.1f' % (8e-03*stats['large_out_data'] + /stats['large_out_time_with_skbs']) + else: + tput = ' N/A' + print('%-10s %6d %6.2f %s' % (node, stats['skbs_freed'], + free_time, tput)) + +#------------------------------------------------ +# Analyzer: core +#------------------------------------------------ +class AnalyzeCore: + """ + Generates statistics about activity on a single core. Requires the --node, + --core, and --data options. 
+ """ + + def __init__(self, dispatcher): + require_options('core', 'data', 'node') + + # List of all intervals over the life of the trace, each list entry + # is a dictionary with the following values related to that interval: + # time: Ending time of the interval + # gro_data: Number of incoming data packets processed by GRO + # gro_grant: Number of incoming grant packets processed by GRO + # softirq_data: Number of incoming data packets processed by SoftIRQ + # softirq_grant: Number of incoming grant packets processed by SoftIRQ + # resends: Number of incoming resend requests processed by SoftIRQ + # busy: Number of BUSY packets sent + # grant_sends: Number of GRANT packets sent + self.intervals = [] + + def init_trace(self, trace): + # Target core id -> list of times when gro chose that core but + # SoftIRQ hasn't yet woken up + self.gro_handoffs = defaultdict(list) + + def get_interval(self, t): + """ + Find the interval corresponding to time t, initializing new intervals + when needed. + """ + global options + + interval_length = options.interval + while True: + if len(self.intervals) == 0: + end = int(t)//interval_length * interval_length + self.intervals.append({'time': end}) + interval = self.intervals[0] + else: + first_end = self.intervals[0]['time'] + i = int((t - (first_end - interval_length))//interval_length) + if i < 0: + raise Exception('Unexpected index %d (time %9.1f) in ' + 'AnalyzeCore.get_interval' % (i, t)) + if i < len(self.intervals): + return self.intervals[i] + self.intervals.append({'time': first_end + + interval_length*len(self.intervals)}) + interval = self.intervals[-1] + interval['gro_data']= 0 + interval['gro_grant']= 0 + interval['softirq_data'] = 0 + interval['softirq_grant'] = 0 + interval['resends'] = 0 + interval['busy'] = 0 + interval['grant_sends'] = 0 + + def inc_counter(self, trace, time, core, name): + """ + Does most of the work of the tt_* methods below: increment the + counter given by name if the record is for the right core. 
+ """ + global options + if (core != options.core) or (trace['node'] != options.node): + return + self.get_interval(time)[name] += 1 + + def tt_gro_data(self, trace, time, core, peer, id, offset, prio): + self.inc_counter(trace, time, core, 'gro_data') + + def tt_gro_grant(self, trace, time, core, peer, id, offset, prio): + self.inc_counter(trace, time, core, 'gro_grant') + + def tt_softirq_data(self, trace, time, core, id, offset, msg_length): + self.inc_counter(trace, time, core, 'softirq_data') + + def tt_softirq_grant(self, trace, time, core, id, offset, priority, + increment): + self.inc_counter(trace, time, core, 'softirq_grant') + + def tt_softirq_resend(self, trace, time, core, id, offset, length, prio): + self.inc_counter(trace, time, core, 'resends') + + def tt_resend_busy(self, trace, time, core, id, state): + self.inc_counter(trace, time, core, 'busy') + + def tt_send_grant(self, trace, time, core, id, offset, priority, increment): + self.inc_counter(trace, time, core, 'grant_sends') + + def output(self): + global options + + print('\n-------------------') + print('Analyzer: core') + print('-------------------') + + if not self.intervals: + print('No data found for core %d on %s' % (options.core, + options.node)) + return + + print('\nOverall statistics:') + print(' Total Avg/Interval') + l = len(self.intervals) + total = sum_fields(self.intervals, 'softirq_data') + print('Data packets processed by SoftIRQ: %6d %6.1f' % + (total, total/l)) + total = sum_fields(self.intervals, 'softirq_grant') + print('Grants processed by SoftIRQ: %6d %6.1f' % + (total, total/l)) + total = sum_fields(self.intervals, 'resends') + print('Resends processed by SoftIRQ: %6d %6.1f' % + (total, total/l)) + total = sum_fields(self.intervals, 'busy') + print('BUSY packets sent: %6d %6.1f' % + (total, total/l)) + total = sum_fields(self.intervals, 'grant_sends') + print('GRANT packets sent: %6d %6.1f' % + (total, total/l)) + + f = open('%s/core_%s-%d.dat' % (options.data, options.node, + options.core), 'w') + f.write('# Node: %s\n' % (options.node)) + f.write('# Core: %d\n' % (options.core)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('#\n') + f.write('# Statistics about activity on core %d of %s over %d usec ' + 'intervals:\n' % (options.core, options.node, + options.interval)) + f.write('# Time: End of the time interval\n') + f.write('# GroD: Data packets processed by GRO\n') + f.write('# GroG: Grant packets processed by GRO\n') + f.write('# SoftD: Data packets processed by SoftIRQ\n') + f.write('# SoftG: Grant packets processed by SoftIRQ\n'); + f.write('# SoftR: RESEND requests processed by SoftIRQ\n'); + f.write('# TxBusy: BUSY packets sent\n'); + f.write('# TxGrant: GRANT packets sent\n'); + + f.write('\n Time GroD GroG SoftD SoftG SoftR TxBusy TxGrant\n') + total = 0 + for interval in self.intervals: + if not 'softirq_data' in interval: + print('Interval: %s' % (interval)) + f.write('%8.1f %5d %5d %5d %5d %5d %5d %5d\n' + % (interval['time'], interval['gro_data'], + interval['gro_grant'], interval['softirq_data'], + interval['softirq_grant'], interval['resends'], + interval['busy'], interval['grant_sends'])) + +#------------------------------------------------ +# Analyzer: coregaps +#------------------------------------------------ +class AnalyzeCoregaps: + """ + Analyzes events on all cores to identify long gaps (periods of time + where there were no trace records for an individual core). 
+ """ + + def __init__(self, dispatcher): + + # node -> dictionary mapping core -> time of most recent + # event on that core. + self.last_event = {} + + # node -> list of tuples: node and core + # identify a particular core, and start and length describe + # a gap where that core was idle. + self.node_gaps = {} + + self.gap_threshold = 5000 + + def init_trace(self, trace): + self.cur_node = {} + self.last_event[trace['node']] = self.cur_node + self.node_gaps[trace['node']] = [] + + def tt_all(self, trace, t, core, msg): + if core in self.cur_node: + gap = t - self.cur_node[core] + if gap > self.gap_threshold: + self.node_gaps[trace['node']].append([trace['node'], core, + self.cur_node[core], gap]) + self.cur_node[core] = t + + def output(self): + global options + max_per_node = 5 + + print('\n-------------------') + print('Analyzer: coregaps') + print('-------------------') + print('') + print('Longest time intervals where no timetrace events were recorded') + print('for a core (limit: %d gaps per node):' % (max_per_node)) + print('') + + # Identify gaps that occurred at the end of the traces (no + # terminating record for the gap). + for node, core_times in self.last_event.items(): + trace_last = traces[node]['last_time'] + for core in core_times.keys(): + gap = trace_last - core_times[core] + if gap > self.gap_threshold: + self.node_gaps[node].append([node, core, core_times[core], gap]) + + if len(self.node_gaps) == 0: + print('There were no gaps longer than %.1f ms' % + (self.gap_threshold/1000)) + return + + print('Node Core Start Length (ms)') + for node in get_sorted_nodes(): + if not node in self.last_event: + continue + node_cores = self.last_event[node] + gaps = sorted(self.node_gaps[node], key=lambda t : t[3], + reverse=True) + count = 0 + for gap in gaps: + if len(gap) != 4: + print('Bad gap: %s' % (gap)) + for gap_node, core, start, length in gaps: + print('%-10s %4d %9.1f %6.1f' % (gap_node, core, start, + length/1000)) + count += 1 + if count >= max_per_node: + break + +#------------------------------------------------ +# Analyzer: delay +#------------------------------------------------ +class AnalyzeDelay: + """ + Prints information about various delays, including delays associated + with packets at various stages and delays in waking up threads. With + --verbose, prints information about specific instances of long delays. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + + # for gro->softirq handoffs + self.softirq_wakeups = [] + + # RPC id -> time when homa_rpc_handoff handed off that RPC to a thread. + self.rpc_handoffs = {} + + # RPC id -> time when homa_rpc_handoff queued the RPC. + self.rpc_queued = {} + + # for softirq->app handoffs (thread was polling) + self.app_poll_wakeups = [] + + # for softirq->app handoffs (thread was sleeping) + self.app_sleep_wakeups = [] + + # for softirq->app handoffs when RPC was queued + # (for request messages, i.e. on server) + self.app_queue_req_wakeups = [] + + # for softirq->app handoffs when RPC was queued + # (for response messages, i.e. 
+        # (for response messages, i.e. on client)
+        self.app_queue_rsp_wakeups = []
+
+        # An entry exists for RPC id if a handoff occurred while a
+        # thread was polling
+        self.poll_success = {}
+
+    def init_trace(self, trace):
+        # Target core id -> list of times when gro chose that core but
+        # SoftIRQ hasn't yet woken up
+        self.gro_handoffs = defaultdict(list)
+
+    def tt_gro_handoff(self, trace, time, core, softirq_core):
+        self.gro_handoffs[softirq_core].append(time)
+
+    def tt_softirq_start(self, trace, time, core):
+        if not self.gro_handoffs[core]:
+            return
+        self.softirq_wakeups.append([time - self.gro_handoffs[core][0], time,
+                trace['node']])
+        self.gro_handoffs[core].pop(0)
+
+    def tt_rpc_handoff(self, trace, time, core, id):
+        if id in self.rpc_handoffs:
+            print('Multiple RPC handoffs for id %s on %s: %9.3f and %9.3f' %
+                    (id, trace['node'], self.rpc_handoffs[id], time),
+                    file=sys.stderr)
+        self.rpc_handoffs[id] = time
+
+    def tt_poll_success(self, trace, time, core, id):
+        self.poll_success[id] = time
+
+    def tt_rpc_queued(self, trace, time, core, id):
+        self.rpc_queued[id] = time
+
+    def tt_wait_found_rpc(self, trace, time, core, id, type, blocked):
+        if id in self.rpc_handoffs:
+            delay = time - self.rpc_handoffs[id]
+            if blocked:
+                self.app_sleep_wakeups.append([delay, time, trace['node']])
+            else:
+                self.app_poll_wakeups.append([delay, time, trace['node']])
+            del self.rpc_handoffs[id]
+        elif id in self.rpc_queued and blocked == 0:
+            if id & 0x1:
+                self.app_queue_req_wakeups.append([time - self.rpc_queued[id],
+                        time, trace['node']])
+            else:
+                self.app_queue_rsp_wakeups.append([time - self.rpc_queued[id],
+                        time, trace['node']])
+            del self.rpc_queued[id]
+
+    def print_pkt_delays(self):
+        """
+        Prints basic packet delay info, returns verbose output for optional
+        printing by caller.
+        """
+        global packets, grants, options
+
+        # Each of the following lists holds <delay, pkt_id, time> tuples for
+        # a particular stage of a packet's lifetime, where delay is the
+        # delay through that stage, pkt_id identifies the packet (rpc_id:offset)
+        # and time is when the delay ended.
+        short_to_nic = []
+        short_to_gro = []
+        short_to_softirq = []
+        short_free = []
+        short_total = []
+
+        long_to_nic = []
+        long_to_gro = []
+        long_to_softirq = []
+        long_free = []
+        long_total = []
+
+        grant_to_nic = []
+        grant_to_gro = []
+        grant_to_softirq = []
+        grant_free = []
+        grant_total = []
+
+        # Collect statistics about delays within individual packets.
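+        # Packets are divided into "short" (message length <= short_limit
+        # bytes) and "long" categories, which are reported separately below.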
+ # short_limit = get_mtu() + # if short_limit == 0: + # short_limit = 1000000 + short_limit = 1000 + for p, pkt in packets.items(): + if (pkt['msg_length'] != None) and (pkt['msg_length'] <= short_limit): + if ('xmit' in pkt) and ('nic' in pkt): + delay = pkt['nic'] - pkt['xmit'] + if delay > 0: + short_to_nic.append([delay, p, pkt['nic']]) + if ('nic' in pkt) and ('gro' in pkt): + delay = pkt['gro'] - pkt['nic'] + if delay > 0: + short_to_gro.append([delay, p, pkt['gro']]) + if ('gro' in pkt) and ('softirq' in pkt): + delay = pkt['softirq'] - pkt['gro'] + if delay > 0: + short_to_softirq.append([delay, p, pkt['softirq']]) + if ('softirq' in pkt) and ('xmit' in pkt): + delay = pkt['softirq'] - pkt['xmit'] + if delay > 0: + short_total.append([delay, p, pkt['softirq']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + delay = pkt['free_tx_skb'] - pkt['nic'] + if delay > 0: + short_free.append([delay, p, pkt['free_tx_skb']]) + else: + if 'tso_length' in pkt: + if 'nic' in pkt: + delay = -1 + if 'qdisc_xmit' in pkt: + delay = pkt['nic'] - pkt['qdisc_xmit'] + elif 'xmit' in pkt: + delay = pkt['nic'] - pkt['xmit'] + if delay > 0: + long_to_nic.append([delay, p, pkt['nic']]) + if 'free_tx_skb' in pkt: + delay = pkt['free_tx_skb'] - pkt['nic'] + if delay > 0: + long_free.append([delay, p, pkt['free_tx_skb']]) + if ('nic' in pkt) and ('gro' in pkt): + delay = pkt['gro'] - pkt['nic'] + if delay > 0: + long_to_gro.append([delay, p, pkt['gro']]) + if ('gro' in pkt) and ('softirq' in pkt): + delay = pkt['softirq'] - pkt['gro'] + if delay > 0: + long_to_softirq.append([delay, p, pkt['softirq']]) + if ('softirq' in pkt) and ('xmit' in pkt): + delay = pkt['softirq'] - pkt['xmit'] + if delay > 0: + long_total.append([delay, p, pkt['softirq']]) + + for p, pkt in grants.items(): + if ('xmit' in pkt) and ('nic' in pkt): + delay = pkt['nic'] - pkt['xmit'] + if delay > 0: + grant_to_nic.append([delay, p, pkt['nic']]) + if ('nic' in pkt) and ('gro' in pkt): + delay = pkt['gro'] - pkt['nic'] + if delay > 0: + grant_to_gro.append([delay, p, pkt['gro']]) + if ('gro' in pkt) and ('softirq' in pkt): + delay = pkt['softirq'] - pkt['gro'] + if delay > 0: + grant_to_softirq.append([delay, p, pkt['softirq']]) + if ('softirq' in pkt) and ('xmit' in pkt): + delay = pkt['softirq'] - pkt['xmit'] + if delay > 0: + grant_total.append([delay, p, pkt['softirq']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + delay = pkt['free_tx_skb'] - pkt['nic'] + if delay > 0: + grant_free.append([delay, p, pkt['free_tx_skb']]) + + print('\n----------------') + print('Analyzer: delay') + print('----------------') + print('Delays in the transmission and processing of data and grant packets') + print('(all times in usecs):') + print('Xmit: Time from ip*xmit call until driver queued packet for NIC') + print(' (for grants, includes time in homa_send_grants and') + print(' homa_xmit_control)') + print('Net: Time from when NIC received packet until GRO started processing') + print('SoftIRQ: Time from GRO until SoftIRQ started processing') + print('Free: Time from when NIC received packet until packet was returned') + print(' to Linux and freed') + print('Total: Total time from ip*xmit call until SoftIRQ processing') + + def print_pcts(data, label): + data.sort(key=lambda t : t[0]) + if not data: + print('%-10s 0' % (label)) + else: + print('%-10s %6d %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' % (label, + len(data), data[0][0], data[10*len(data)//100][0], + data[50*len(data)//100][0], data[90*len(data)//100][0], + data[99*len(data)//100][0], 
data[len(data)-1][0], + list_avg(data, 0))) + print('\nPhase Count Min P10 P50 P90 P99 Max Avg') + print('-------------------------------------------------------------------------') + print('Data packets from messages <= %d bytes:' % (short_limit)) + print_pcts(short_to_nic, 'Xmit') + print_pcts(short_to_gro, 'Net') + print_pcts(short_to_softirq, 'SoftIRQ') + print_pcts(short_free, 'Free') + print_pcts(short_total, 'Total') + + print('\nData packets from messages > %d bytes:' % (short_limit)) + print_pcts(long_to_nic, 'Xmit') + print_pcts(long_to_gro, 'Net') + print_pcts(long_to_softirq, 'SoftIRQ') + print_pcts(long_free, 'Free') + print_pcts(long_total, 'Total') + + print('\nGrants:') + print_pcts(grant_to_nic, 'Xmit') + print_pcts(grant_to_gro, 'Net') + print_pcts(grant_to_softirq, 'SoftIRQ') + print_pcts(grant_free, 'Free') + print_pcts(grant_total, 'Total') + + # Handle --verbose for packet-related delays. + def print_worst(data, label): + global rpcs + + # The goal is to print about 20 packets covering the 98th-100th + # percentiles; we'll print one out of every "interval" packets. + result = '' + num_pkts = len(data) + interval = num_pkts//(50*20) + if interval == 0: + interval = 1 + for i in range(num_pkts-1, num_pkts - 20*interval, -interval): + if i < 0: + break + pkt = data[i] + recv_id = int(pkt[1].split(':')[0]) ^ 1 + dest = ' ???? ??' + if recv_id in rpcs: + rpc = rpcs[recv_id] + if 'gro_core' in rpc: + dest = '%10s %4d' % (rpc['node'], rpc['gro_core']) + else: + dest = '%10s ??' % (rpc['node']) + result += '%-8s %6.1f %20s %s %9.3f %5.1f\n' % (label, pkt[0], + pkt[1], dest, pkt[2], i*100/num_pkts) + return result + + verbose = 'Sampled packets with outlier delays:\n' + verbose += 'Phase: Phase of delay: Xmit, Net, or SoftIRQ\n' + verbose += 'Delay: Delay for this phase\n' + verbose += 'Packet: Sender\'s identifier for packet: rpc_id:offset\n' + verbose += 'Node: Node where packet was received\n' + verbose += 'Core: Core where homa_gro_receive processed packet\n' + verbose += 'EndTime: Time when phase completed\n' + verbose += 'Pctl: Percentile of this packet\'s delay\n\n' + verbose += ('Phase Delay (us) Packet RecvNode Core ' + 'EndTime Pctl\n') + verbose += ('--------------------------------------------------------' + '-------------\n') + + verbose += 'Data packets from messages <= %d bytes:\n' % (short_limit) + verbose += print_worst(short_to_nic, 'Xmit') + verbose += print_worst(short_to_gro, 'Net') + verbose += print_worst(short_to_softirq, 'SoftIRQ') + verbose += print_worst(short_total, 'Total') + + verbose += '\nData packets from messages > %d bytes:\n' % (short_limit) + verbose += print_worst(long_to_nic, 'Xmit') + verbose += print_worst(long_to_gro, 'Net') + verbose += print_worst(long_to_softirq, 'SoftIRQ') + verbose += print_worst(long_total, 'Total') + + verbose += '\nGrants:\n' + verbose += print_worst(grant_to_nic, 'Xmit') + verbose += print_worst(grant_to_gro, 'Net') + verbose += print_worst(grant_to_softirq, 'SoftIRQ') + verbose += print_worst(grant_total, 'Total') + + # Redo the statistics gathering, but only include the worst packets + # from each category. 
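+        # Here, "worst" means packets whose total delay falls between the
+        # 98th and 99th percentiles for their category.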
+ if short_total: + min_short = short_total[98*len(short_total)//100][0] + max_short = short_total[99*len(short_total)//100][0] + else: + min_short = 0 + max_short = 0 + if long_total: + min_long = long_total[98*len(long_total)//100][0] + max_long = long_total[99*len(long_total)//100][0] + else: + min_long = 0 + max_long = 0 + if grant_total: + min_grant = grant_total[98*len(grant_total)//100][0] + max_grant = grant_total[99*len(grant_total)//100][0] + else: + min_grant = 0 + max_grant = 0 + + short_to_nic = [] + short_to_gro = [] + short_to_softirq = [] + short_free = [] + + long_to_nic = [] + long_to_gro = [] + long_to_softirq = [] + long_free = [] + + grant_to_nic = [] + grant_to_gro = [] + grant_to_softirq = [] + grant_free = [] + + for p, pkt in packets.items(): + if (not 'softirq' in pkt) or (not 'xmit' in pkt): + continue + total = pkt['softirq'] - pkt['xmit'] + if (pkt['msg_length'] != None) and (pkt['msg_length'] <= short_limit): + if (total < min_short) or (total > max_short): + continue + if ('xmit' in pkt) and ('nic' in pkt): + short_to_nic.append( + [pkt['nic'] - pkt['xmit'], p, pkt['nic']]) + if ('nic' in pkt) and ('gro' in pkt): + short_to_gro.append( + [pkt['gro'] - pkt['nic'], p, pkt['gro']]) + if ('gro' in pkt) and ('softirq' in pkt): + short_to_softirq.append( + [pkt['softirq'] - pkt['gro'], p, pkt['softirq']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + short_free.append( + [pkt['free_tx_skb'] - pkt['nic'], p, pkt['free_tx_skb']]) + else: + if (total < min_long) or (total > max_long): + continue + if 'tso_length' in pkt: + if ('qdisc_xmit' in pkt) and ('nic' in pkt): + long_to_nic.append( + [pkt['nic'] - pkt['qdisc_xmit'], p, pkt['nic']]) + elif ('xmit' in pkt) and ('nic' in pkt): + long_to_nic.append( + [pkt['nic'] - pkt['xmit'], p, pkt['nic']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + long_free.append( + [pkt['free_tx_skb'] - pkt['nic'], p, + pkt['free_tx_skb']]) + if ('nic' in pkt) and ('gro' in pkt): + long_to_gro.append( + [pkt['gro'] - pkt['nic'], p, pkt['gro']]) + if ('gro' in pkt) and ('softirq' in pkt): + long_to_softirq.append( + [pkt['softirq'] - pkt['gro'], p, pkt['softirq']]) + + for pkt in grants.values(): + if (not 'softirq' in pkt) or (not 'xmit' in pkt): + continue + total = pkt['softirq'] - pkt['xmit'] + if (total < min_grant) or (total > max_grant): + continue; + if ('xmit' in pkt) and ('nic' in pkt): + grant_to_nic.append( + [pkt['nic'] - pkt['xmit'], p, pkt['nic']]) + if ('nic' in pkt) and ('gro' in pkt): + grant_to_gro.append( + [pkt['gro'] - pkt['nic'], p, pkt['gro']]) + if ('gro' in pkt) and ('softirq' in pkt): + grant_to_softirq.append( + [pkt['softirq'] - pkt['gro'], p, pkt['softirq']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + grant_free.append( + [pkt['free_tx_skb'] - pkt['nic'], p, pkt['free_tx_skb']]) + + def get_slow_summary(data): + if not data: + return " "*13 + data.sort(key=lambda t : t[0]) + return '%6.1f %6.1f' % (data[50*len(data)//100][0], + list_avg(data, 0)) + + print('\nPhase breakdown for P98-P99 packets:') + print(' Xmit Net SoftIRQ Free') + print(' Pkts P50 Avg P50 Avg P50 Avg P50 Avg') + print('---------------------------------------------------------------------------') + print('Short msgs %5d %s %s %s %s' % (len(short_to_nic), + get_slow_summary(short_to_nic), + get_slow_summary(short_to_gro), + get_slow_summary(short_to_softirq), + get_slow_summary(short_free))) + print('Long msgs %5d %s %s %s %s' % (len(long_to_nic), + get_slow_summary(long_to_nic), + get_slow_summary(long_to_gro), + 
get_slow_summary(long_to_softirq), + get_slow_summary(long_free))) + print('Grants %5d %s %s %s %s' % (len(grant_to_nic), + get_slow_summary(grant_to_nic), + get_slow_summary(grant_to_gro), + get_slow_summary(grant_to_softirq), + get_slow_summary(grant_free))) + return verbose + + def print_wakeup_delays(self): + """ + Prints basic info about thread wakeup delays, returns verbose output + for optional printing by caller. + """ + global options + + soft = self.softirq_wakeups + soft.sort() + app_poll = self.app_poll_wakeups + app_poll.sort() + app_sleep = self.app_sleep_wakeups + app_sleep.sort() + app_queue_req = self.app_queue_req_wakeups + app_queue_req.sort() + app_queue_rsp = self.app_queue_rsp_wakeups + app_queue_rsp.sort() + print('\nDelays in handing off from one core to another:') + print(' Count Min P10 P50 ' + 'P90 P99 Max Avg') + print('------------------------------------------------------------' + '------------------------') + + def print_percentiles(label, data): + num = len(data) + if num == 0: + print('%-30s %6d' % (label, 0)) + else: + print('%-30s %6d %5.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' + % (label, num, data[0][0], data[10*num//100][0], + data[50*num//100][0], data[90*num//100][0], + data[99*num//100][0], data[num-1][0], list_avg(data, 0))) + print_percentiles('GRO to SoftIRQ:', soft) + print_percentiles('SoftIRQ to polling app:', app_poll) + print_percentiles('SoftIRQ to sleeping app:', app_sleep) + print_percentiles('SoftIRQ to server via queue:', app_queue_req) + print_percentiles('SoftIRQ to client via queue:', app_queue_rsp) + + verbose = 'Worst-case handoff delays:\n' + verbose += 'Type Delay (us) End Time Node Pctl\n' + verbose += '------------------------------------------------------------------\n' + + def print_worst(label, data): + # The goal is to print about 10 records covering the 98th-100th + # percentiles; we'll print one out of every "interval" packets. + num = len(data) + interval = num//(50*10) + if interval == 0: + interval = 1 + result = '' + for i in range(num-1, num - 10*interval, -interval): + if i < 0: + break + time, delay, node = data[i] + result += '%-30s %6.1f %9.3f %10s %5.1f\n' % ( + label, time, delay, node, + 100*i/(num-1) if num > 1 else 100) + return result + + verbose += print_worst('GRO to SoftIRQ', soft) + verbose += print_worst('SoftIRQ to polling app', app_poll) + verbose += print_worst('SoftIRQ to sleeping app', app_sleep) + verbose += print_worst('SoftIRQ to server via queue', app_queue_req) + verbose += print_worst('SoftIRQ to client via queue', app_queue_rsp) + return verbose + + def output(self): + global options + + delay_verbose = self.print_pkt_delays() + wakeup_verbose = self.print_wakeup_delays() + if options.verbose: + print('') + print(delay_verbose, end='') + print('') + print(wakeup_verbose, end='') + +#------------------------------------------------ +# Analyzer: filter +#------------------------------------------------ +class AnalyzeFilter: + """ + Select packets based on various criteria, then print summary statistics + for those packets. The following command-line options are used to filter + the packets: --tx-node, --rx-node, --tx-qid, --msglen, --grolat, --segs, + --pkt_type, and --filter. If --verbose is specified then the matching + packets are printed in detail. The --sort option selects a column to + use for sorting the packets; it must be one of Xmit, Nic, Gro, SoftIRQ, + or Free (default is Xmit). 
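+    Note: --tx-core and --rx-core are also honored when filtering, and
+    packet types are selected with --pkt_types.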
+ """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + return + + def filter_short_tcp(self, pkt): + """ + Returns True if pkt is a short TCP packet: it has some data, but + no more than 1500 bytes. + """ + if pkt['type'] == '': + return False + if not 'tso_length' in pkt: + return False + length = pkt['tso_length'] + # print('\nLength %d: pkt %s' % (length, pkt)) + return length > 10 and length <= 1500 + + def filter_packets(self, options): + """ + Returns a list containing all of the packets that match options. + In addition, all returned packets will have valid 'xmit' and 'gro' + fields. + + options: A dictionary of option values (see class doc for list of + valid options); usually contains the command-line options. + """ + global packets, tcp_packets, grants, rpcs + + filter_func = None + if options.filter != None: + name = 'filter_' + options.filter + filter_func = getattr(self, name, None) + if filter_func == None or not callable(filter_func): + raise Exception('Couldn\'t find a filter method %s in the ' + '%s class' % (name, self.__class__.__name__)) + + if options.msglen != None: + min_length, max_length = get_range(options.msglen, + option_name='--msglen', one_value=True) + if max_length == None: + max_length = min_length + min_length = 0 + + if options.grolat != None: + min_gro, max_gro = get_range(options.grolat, + option_name='--grolat', parse_float=True, one_value=True) + if max_gro == None: + max_gro = 1e20 + + pkt_dict = {} + for name in options.pkt_types.split(): + if name == 'all': + pkt_dict['data'] = packets.values() + pkt_dict['tcp'] = tcp_packets.values() + pkt_dict['grant'] = grants.values() + elif name == 'data': + pkt_dict['data'] = packets.values() + elif name == 'tcp': + pkt_dict['tcp'] = tcp_packets.values() + elif name == 'grant': + pkt_dict['grant'] = grants.values() + else: + raise Exception('Unknown packet type \'%s\'; must be \'data\', ' + '\'tcp\', or \'grant\'' % (name)) + + result = [] + for pkt_list in pkt_dict.values(): + for pkt in pkt_list: + if not 'gro' in pkt or not 'xmit' in pkt: + continue + if options.tx_node != None and options.tx_node != pkt['tx_node']: + continue + if options.rx_node != None and options.rx_node != pkt['rx_node']: + continue + if options.tx_core != None and options.tx_core != pkt['tx_core']: + continue + if options.rx_core != None and options.rx_core != pkt['gro_core']: + continue + if options.tx_qid != None and (not 'tx_qid' in pkt or + options.tx_qid != pkt['tx_qid']): + continue + if options.msglen != None: + if not 'msg_length' in pkt: + continue + length = pkt['msg_length'] + if length < min_length or length > max_length: + continue + if options.grolat != None: + latency = pkt['gro'] - pkt['xmit'] + if latency < min_gro or latency > max_gro: + continue + if not options.segs and not 'tso_length' in pkt: + continue + if filter_func != None and not filter_func(pkt): + continue + result.append(pkt) + return result + + def output(self): + global options + + pkts = self.filter_packets(options) + + print('\n----------------') + print('Analyzer: filter') + print('----------------\n') + if not pkts: + print('No packets matched filters') + return + print('%d packets were selected using the following filters:' % + (len(pkts))) + print(' --pkt_types %s' % (options.pkt_types)) + if options.tx_node != None: + print(' --tx-node %s' % (options.tx_node)) + if options.tx_core != None: + print(' --tx-core %d' % (options.tx_core)) + if options.tx_qid != None: + print(' --tx-qid 
%d' % (options.tx_qid)) + if options.rx_node != None: + print(' --rx-node %s' % (options.rx_node)) + if options.rx_core != None: + print(' --rx-core %s' % (options.rx_core)) + if options.segs: + print(' --segs True') + if options.msglen: + print(' --msglen %s' % (options.msglen)) + if options.grolat: + print(' --grolat %s' % (options.grolat)) + if options.filter != None: + print(' --filter %s' % (options.filter)) + + nic = [] + gro = [] + softirq = [] + free = [] + total = [] + + for pkt in pkts: + if 'xmit' in pkt and 'nic' in pkt: + nic.append(pkt['nic'] - pkt['xmit']) + if 'nic' in pkt and 'gro' in pkt: + gro.append(pkt['gro'] - pkt['nic']) + if 'gro' in pkt and 'softirq' in pkt: + softirq.append(pkt['softirq'] - pkt['gro']) + if 'softirq' in pkt and 'xmit' in pkt: + total.append(pkt['softirq'] - pkt['xmit']) + elif 'gro' in pkt and 'xmit' in pkt: + total.append(pkt['gro'] - pkt['xmit']) + if 'nic' in pkt and 'free_tx_skb' in pkt: + free.append(pkt['free_tx_skb'] - pkt['nic']) + + print('\nDelays (in usecs) for each of the following phases of the ' + 'selected packets\'') + print('lifetimes:') + print('Xmit: Time from ip*xmit call until NIC handoff') + print('Net: Time from NIC handoff until GRO started processing') + print('SoftIRQ: Time from GRO until SoftIRQ started processing') + print('Free: Time from NIC handoff until packet was returned to') + print(' Linux and freed') + print('Total: Total time from ip*xmit call until SoftIRQ started') + print() + + def print_pcts(data, label): + data.sort() + if not data: + print('%-10s 0' % (label)) + else: + print('%-10s %6d %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' % (label, + len(data), data[0], data[10*len(data)//100], + data[50*len(data)//100], data[90*len(data)//100], + data[99*len(data)//100], data[len(data)-1], + sum(data)/len(data))) + + print('Phase Count Min P10 P50 P90 P99 Max Avg') + print('------------------------------------------------------------------') + print_pcts(nic, 'Xmit') + print_pcts(gro, 'Net') + print_pcts(softirq, 'SoftIRQ') + print_pcts(free, 'Free') + print_pcts(total, 'Total') + + if not options.verbose: + return + + print() + print('# Details of the selected packets, sorted by \'%s\':' % + (options.sort)) + print(print_pkts(sort_pkts(pkts, options.sort), header=True), end='') + +#------------------------------------------------ +# Analyzer: grantlock +#------------------------------------------------ +class AnalyzeGrantlock: + """ + Analyzes contention for the grant lock, which controls centrally + managed data about grantable RPCs. + """ + + def __init__(self, dispatcher): + + # Node name -> dictionary with data about that node: + # last_block: core -> last time that core blocked on the lock + # block_time: core -> total time that core was blocked on the lock + # total_hold: total time this core spent holding the lock + # hold_count: number of distinct intervals with this core held the lock + # max_hold: max amount of time lock was held before releasing + # max_time: time when lock was released after max_hold + # + self.nodes = {} + + # One record for each interval where a core blocked for the grant + # lock: where time is when the lock was + # finally acquired, duration is how long the core had to wait, and + # node and core indicate where the block occurred. 
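+        # (Each record is a <time, duration, node, core> list.)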
+        self.block_intervals = []
+
+        # One record for each interval where it can be determined that the
+        # lock was held by one core: <time, duration, node, core>, where
+        # time is when the lock was acquired, duration is the elapsed time
+        # until the next core got the lock, and core is the core that
+        # acquired the lock.
+        self.hold_times = []
+
+        # Number of cores currently blocked on the lock.
+        self.blocked_cores = 0
+
+        # The last time that a core unblocked after waiting for the lock
+        # when at least one other core was waiting for the lock (used to
+        # compute hold_times).
+        self.last_unblock = None
+
+        # The core where last_unblock occurred.
+        self.last_core = None
+
+    def init_trace(self, trace):
+        self.node = {
+            'last_block': {},
+            'block_times': defaultdict(lambda: 0),
+            'block_time': 0,
+            'total_hold': 0,
+            'hold_count': 0,
+            'max_hold': 0,
+            'max_time': 0}
+        self.nodes[trace['node']] = self.node
+        self.blocked_cores = 0
+        self.last_unblock = None
+
+    def tt_lock_wait(self, trace, time, core, event, lock_name):
+        if lock_name != 'grant':
+            return
+        if event == 'beginning':
+            # Core blocked on lock
+            self.node['last_block'][core] = time
+            self.blocked_cores += 1
+        else:
+            # Blocked core acquired lock
+            if core in self.node['last_block']:
+                duration = time - self.node['last_block'][core]
+                self.node['block_times'][core] += duration
+                self.block_intervals.append([time, duration, trace['node'],
+                        core])
+                if self.last_unblock != None:
+                    hold = time - self.last_unblock
+                    self.hold_times.append([self.last_unblock, hold,
+                            trace['node'], self.last_core])
+                    self.node['total_hold'] += hold
+                    self.node['hold_count'] += 1
+                    if hold > self.node['max_hold']:
+                        self.node['max_hold'] = hold
+                        self.node['max_time'] = time
+                self.blocked_cores -= 1
+                if self.blocked_cores > 0:
+                    self.last_unblock = time
+                    self.last_core = core
+                else:
+                    self.last_unblock = None
+
+    def output(self):
+        global traces
+
+        print('\n-----------------------')
+        print('Analyzer: grantlock')
+        print('-----------------------\n')
+
+        print('Per-node statistics on usage of the grant lock:')
+        print('Node:     Name of node')
+        print('Blocked:  Fraction of core(s) wasted while blocked on the lock '
+                '(1.0 means')
+        print('          that on average, one core was blocked on the lock)')
+        print('MaxCore:  The core that spent the largest fraction of its time '
+                'blocked on')
+        print('          the grant lock')
+        print('MaxBlk:   Fraction of time that MaxCore was blocked on the lock')
+        print('HoldFrac: Fraction of time this node held the lock (note: '
+                'hold times')
+        print('          can be computed only when there are 2 or more '
+                'waiting cores,')
+        print('          so this is an underestimate)')
+        print('AvgHold:  Average time that lock was held before releasing')
+        print('MaxHold:  Largest time that lock was held before releasing')
+        print('MaxTime:  Time when MaxHold ended')
+        print('')
+
+        # <name, total_block, max_block_time, max_block_core> for each node.
+        data = []
+        for name, node in self.nodes.items():
+            total_block = 0
+            max_block_time = -1
+            max_block_core = -1
+            for core in sorted(node['block_times']):
+                t = node['block_times'][core]
+                total_block += t
+                if t > max_block_time:
+                    max_block_time = t
+                    max_block_core = core
+            data.append([name, total_block, max_block_time, max_block_core])
+
+        print('Node       Blocked MaxCore MaxBlk HoldFrac AvgHold MaxHold  MaxTime')
+        print('--------------------------------------------------------------------')
+        for name, total_block, max_block, max_block_core in sorted(
+                data, key=lambda t: t[1], reverse=True):
+            elapsed = traces[name]['elapsed_time']
+            node = self.nodes[name]
+            if node['hold_count'] > 0:
hold_info = ' %4.2f %6.2f %7.2f %10.3f' % ( + node['total_hold']/elapsed, + node['total_hold']/node['hold_count'], + node['max_hold'], node['max_time']) + else: + hold_info = ' 0.00 N/A N/A N/A' + print('%-10s %5.2f C%02d %6.3f %s' % (name, total_block/elapsed, + max_block_core, max_block/elapsed, hold_info)) + + print('\nLongest times a core had to wait for the grant lock:') + print(' EndTime BlockTime Node Core') + self.block_intervals.sort(key=lambda t : t[1], reverse=True) + for i in range(len(self.block_intervals)): + if i >= 10: + break + time, duration, node, core = self.block_intervals[i] + print('%9.3f %7.1f %10s %4d' % (time, duration, node, core)) + + print('\nLongest periods that one core held the grant lock:') + print('StartTime HoldTime Node Core') + self.hold_times.sort(key=lambda t : t[1], reverse=True) + for i in range(len(self.hold_times)): + if i >= 10: + break + time, duration, node, core = self.hold_times[i] + print('%9.3f %7.1f %10s %4d' % (time, duration, node, core)) + +#------------------------------------------------ +# Analyzer: grants +#------------------------------------------------ +class AnalyzeGrants: + """ + Generates statistics about the granting mechanism, such as the number of + grants outstanding for incoming messages and the number of granted bytes + available for outgoing messages. If --data is specified, then two files + are created for each node in the data directory, with names + "grants_rx_" and "grants_tx_". These files contain information + about all incoming/outgoing RPCs with outstanding/available grants in each + time interval. In addition, statistics are generated about the time spent + in homa_grant_check_rpc. + """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzeIntervals') + + # Node name -> total time spent in homa_grant_check_rpc on that node. + self.node_check_time = defaultdict(lambda : 0) + + # Node name -> total time spent acquiring and holding the grant lock + # while validating/updating grant priorities in homa_grant_check_rpc + # on that node. + self.node_lock_recalc_time = defaultdict(lambda : 0) + + # Node name -> total time spent acquiring and holding the grant lock + # while checking RPCs other than the calling one in homa_grant_check_rpc + # on that node. + self.node_lock_needy_time = defaultdict(lambda : 0) + + # Node name -> total time spent sending grants during homa_grant_check_rpc. + self.node_grant_send_time = defaultdict(lambda : 0) + + # Node name -> number of times the grant lock was acquired by + # homa_grant_check_rpc on that node. 
+ self.node_locks = defaultdict(lambda : 0) + + # Node name -> number of calls to homa_grant_check_rpc + self.node_checks = defaultdict(lambda : 0) + + # Node name -> count of grants sent in calls to homa_grant_check_rpc + self.node_grants_sent = defaultdict(lambda : 0) + + def init_trace(self, trace): + # Core -> start time of active call to homa_grant_check_rpc (if any) + self.core_check_start = {} + + # Core -> time when homa_grant_check_rpc started acquiring the + # grant lock to valid priority order (if any) + self.core_lock_recalc_start = {} + + # Core -> time when homa_grant_check_rpc started acquiring the + # grant lock because RPCs other than the invoking one needed to + # be checked for possible grants (if any) + self.core_lock_needy_start = {} + + # Core -> time of first grant sent by current call to + # homa_grant_check_rpc (only valid if homa_grant_check_rpc in progress) + self.core_first_grant_send = {} + + def tt_grant_check_start(self, trace, t, core, id): + self.node_checks[trace['node']] += 1 + self.core_check_start[core] = t + + def tt_grant_check_lock_recalc(self, trace, t, core, id): + node = trace['node'] + self.core_lock_recalc_start[core] = t + + def tt_grant_check_lock_needy(self, trace, t, core, id, rank, active): + node = trace['node'] + self.core_lock_needy_start[core] = t + + def tt_grant_check_unlock(self, trace, t, core, id): + node = trace['node'] + if core in self.core_lock_recalc_start: + self.node_locks[node] += 1 + self.node_lock_recalc_time[node] += (t - + self.core_lock_recalc_start[core]) + del self.core_lock_recalc_start[core] + elif core in self.core_lock_needy_start: + self.node_locks[node] += 1 + self.node_lock_needy_time[node] += (t - + self.core_lock_needy_start[core]) + del self.core_lock_needy_start[core] + + def tt_send_grant(self, trace, t, core, id, offset, priority, increment): + if not core in self.core_check_start: + return + self.node_grants_sent[trace['node']] += 1 + if not core in self.core_first_grant_send: + self.core_first_grant_send[core] = t + + def tt_grant_check_done(self, trace, t, core, id): + node = trace['node'] + if core in self.core_first_grant_send: + grant = self.core_first_grant_send[core] + self.node_grant_send_time[node] += (t - grant) + del self.core_first_grant_send[core] + if core in self.core_check_start: + self.node_check_time[node] += t - self.core_check_start[core] + del self.core_check_start[core] + + def print_grant_check_stats(self): + print('\nStatistics about the function homa_grant_check_rpc:') + print('Node: Name of node') + print('Checks: Rate of calling homa_grant_check_rpc (k/sec)') + print('CUsec: Average execution time in homa_grant_check_rpc') + print('CCores: Average active cores in homa_grant_check_rpc') + print('LPer: Average # of acquisitions of the grant lock per call to ') + print(' homa_grant_check_rpc') + print('RUsec: Average time spent acquiring/holding the grant ' + 'lock for priority ') + print(' recalculations') + print('RCores: Average cores acquiring/hold the grant lock for ' + 'priority recalculation') + print('NUsec: Average time spent acquiring/holding the grant ' + 'lock while considering ') + print(' needy RPCs other than the calling one') + print('NCores: Average cores acquiring/hold the grant lock while ' + 'considering needy') + print(' RPCs other than the calling one') + print('GPer: Average grants sent per call to homa_grant_check_rpc') + print('GUsec: Average time to send a grant') + print('GCores: Average cores actively sending grants from within ' + 
                'homa_grant_check_rpc')
+
+        print('')
+        print('Node       Checks CUsec CCores  LPer  RUsec RCores '
+                'NUsec NCores  GPer GUSec GCores')
+        print('----------------------------------------------------'
+                '--------------------------------')
+        for node in get_sorted_nodes():
+            checks = self.node_checks[node]
+            locks = self.node_locks[node]
+            grants = self.node_grants_sent[node]
+            recalc_time = self.node_lock_recalc_time[node]
+            needy_time = self.node_lock_needy_time[node]
+            grant_time = self.node_grant_send_time[node]
+            check_time = self.node_check_time[node]
+            elapsed = traces[node]['elapsed_time']
+            print('%-10s %5.1f %5.2f %5.2f ' % (node, 1000*checks/elapsed,
+                    check_time/checks if checks else 0, check_time/elapsed),
+                    end='')
+            print('%5.2f %6.3f %5.2f %6.3f %5.2f ' % (locks/checks if checks else 0,
+                    recalc_time/checks if checks else 0, recalc_time/elapsed,
+                    needy_time/checks if checks else 0, needy_time/elapsed),
+                    end='')
+            print('%5.2f %5.2f %5.2f' % (grants/checks if checks else 0,
+                    grant_time/grants if grants else 0, grant_time/elapsed))
+
+    def get_events(self):
+        """
+        Returns a list of events of interest for this analyzer. Elements
+        in the list have one of the following forms:
+        <time, 'txdata', node, id, offset, length>
+            Time is when data packet was passed to ip_queue_xmit on node.
+            Id is an RPC id, offset is the offset of first data byte in packet
+            and length is the number of data bytes in the packet.
+        <time, 'rxdata', node, id, offset, length>
+            Similar to "txdata" except describes when homa_softirq processed
+            the packet on the receiving node.
+        <time, 'txgrant', node, id, offset>
+            Time is when a grant packet was created (in homa_create_grants) on
+            node, id identifies the RPC for the grant, and offset is the byte
+            just after the last one now granted for the RPC. Used for
+            unscheduled bytes as well as actual grants.
+        <time, 'rxgrant', node, id, offset>
+            Similar to "txgrant" except the time is when homa_grant_pkt
+            processed the packet on the receiving node.
+        The return value will be sorted by time.
+ """ + global rpcs + + events = [] + for id, rpc in rpcs.items(): + node = rpc['node'] + + if 'sendmsg' in rpc: + if id^1 in rpcs: + other = rpcs[id^1] + else: + other = {} + if 'unsched' in other: + unsched = other['unsched'] + else: + unsched = max_unsched; + if 'out_length' in rpc: + if rpc['out_length'] < max_unsched: + unsched = rpc['out_length'] + events.append([rpc['sendmsg'], 'rxgrant', node, id, unsched]) + + if 'send_data' in rpc: + for time, offset, length in rpc['send_data']: + events.append([time, 'txdata', node, id, offset, length]) + if rpc['softirq_data']: + for time, offset in rpc['softirq_data']: + events.append([time, 'rxdata', node, id, offset, + get_recv_length(offset, rpc['in_length'])]) + if 'send_grant' in rpc: + for time, offset, priority, increment in rpc['send_grant']: + events.append([time, 'txgrant', node, id, offset]) + if 'softirq_grant' in rpc: + for time, offset in rpc['softirq_grant']: + events.append([time, 'rxgrant', node, id, offset]) + return sorted(events) + + class RpcDict(dict): + """ + id -> dict for each RPC; dict contains any or all of: + rx_data_offset: Offset just after last byte received + tx_data_offset: Offset just after last byte sent; -1 means no data + was ever sent for this message, so we ignore it + for tx stats + rx_grant_offset: Offset just after last incoming byte granted + tx_grant_offset: Offset just after last outgoing byte granted + rx_length: Length of incoming message for this RPC, or -1 if + not known + tx_length: Length of outgoing message for this RPC, or -1 if + not known + """ + def __missing__(self, key): + global rpcs + self[key] = {'rx_data_offset': 0, 'tx_data_offset': -1, + 'rx_grant_offset': 0, 'tx_grant_offset': 0} + record = self[key] + rpc = rpcs[key] + if rpc['in_length'] != None: + record['rx_length'] = rpc['in_length'] + else: + record['rx_length'] = -1 + if 'out_length' in rpc: + record['tx_length'] = rpc['out_length'] + else: + record['tx_length'] = -1 + if rpc['send_data']: + record['tx_data_offset'] = rpc['send_data'][0][1] + return record + + class NodeDict(dict): + """ + node -> dict for each node; dict contains: + name: Name of the node + rx_bytes: Total bytes across all incoming messages that + have been granted but not yet received + tx_bytes: Total bytes across all outgoing messages that + have been granted but not yet sent + rx_rpcs: Id -> True for all incoming messages with granted + bytes that haven't yet been received + tx_rpcs: Id -> True for all outgoing messages with granted + bytes that haven't yet been sent + *_integral: For each of the above, sum of value*dt + tx_times: Array: element n is total time when tx_msgs was n + rx_times: Array: element n is total time when rx_msgs was n + prev_time: The last time any of the stats above were changed + rx_data: Accumulates detailed grant info for incoming messages + when --data is specified; one line per interval + tx_data: Accumulates detailed grant info for outgoing messages + when --data is specified; one line per interval + """ + def __missing__(self, key): + global traces + self[key] = {'name': key, 'rx_bytes': 0, 'tx_bytes': 0, + 'rx_rpcs': {}, 'tx_rpcs': {}, + 'rx_bytes_integral': 0, 'tx_bytes_integral': 0, + 'rx_msgs_integral': 0, 'tx_msgs_integral': 0, + 'tx_times': [0, 0, 0, 0], 'rx_times': [0, 0, 0, 0], + 'prev_time': traces[key]['first_time'], + 'rx_data': '', 'tx_data': ''} + return self[key] + + def check_node(self, node, local_rpcs): + """ + Check consistency of node with current state of RPCs. 
+
+        node:       Node to check (element of NodeDict).
+        local_rpcs: RpcDict containing information about RPCs.
+        """
+
+        global rpcs
+        rx_bytes = 0
+        rx_msgs = 0
+        tx_bytes = 0
+        tx_msgs = 0
+        node_name = node['name']
+        for id, rpc in local_rpcs.items():
+            if rpcs[id]['node'] != node_name:
+                continue
+            delta = rpc['rx_grant_offset'] - rpc['rx_data_offset']
+            if delta > 0:
+                rx_msgs += 1
+                rx_bytes += delta
+            delta = rpc['tx_grant_offset'] - rpc['tx_data_offset']
+            if delta > 0:
+                tx_msgs += 1
+                tx_bytes += delta
+        if rx_msgs != len(node['rx_rpcs']):
+            print('Error for node %s: rx_msgs expected %d, got %d' %
+                    (node_name, rx_msgs, len(node['rx_rpcs'])))
+        if rx_bytes != node['rx_bytes']:
+            print('Error for node %s: rx_bytes expected %d, got %d' %
+                    (node_name, rx_bytes, node['rx_bytes']))
+        if tx_msgs != len(node['tx_rpcs']):
+            print('Error for node %s: tx_msgs expected %d, got %d' %
+                    (node_name, tx_msgs, len(node['tx_rpcs'])))
+        if tx_bytes != node['tx_bytes']:
+            print('Error for node %s: tx_bytes expected %d, got %d' %
+                    (node_name, tx_bytes, node['tx_bytes']))
+
+    def rx_info(self, node, local_rpcs):
+        """
+        Return a line of text describing the current state of grants for
+        incoming messages for a node.
+
+        node:       Node of interest (element of NodeDict).
+        local_rpcs: RpcDict containing information about RPCs.
+        """
+
+        records = []
+        for id in node['rx_rpcs']:
+            rpc = local_rpcs[id]
+            length = rpc['rx_length']
+            data_offset = rpc['rx_data_offset']
+            grant_offset = rpc['rx_grant_offset']
+            outstanding = grant_offset - data_offset
+            if outstanding < 0:
+                outstanding = 0
+            if length >= 0:
+                records.append([length - data_offset, id, outstanding])
+            else:
+                records.append([1e20, id, outstanding])
+        records.sort(reverse=True)
+        result = ''
+        for remaining, id, outstanding in records:
+            if remaining == 1e20:
+                result += '%12d      ?? %6d' % (id, outstanding)
+            else:
+                result += '%12d %7d %6d' % (id, remaining, outstanding)
+        return result
+
+    def tx_info(self, node, local_rpcs):
+        """
+        Return a line of text describing the current state of grants for
+        outgoing messages for a node.
+
+        node:       Node of interest (element of NodeDict).
+        local_rpcs: RpcDict containing information about RPCs.
+        """
+
+        records = []
+        for id in node['tx_rpcs']:
+            rpc = local_rpcs[id]
+            length = rpc['tx_length']
+            data_offset = rpc['tx_data_offset']
+            grant_offset = rpc['tx_grant_offset']
+            available = grant_offset - data_offset
+            if available < 0:
+                available = 0
+            if length >= 0:
+                records.append([length - data_offset, id, available])
+            else:
+                records.append([1e20, id, available])
+        records.sort(reverse=True)
+        result = ''
+        for remaining, id, available in records:
+            if remaining == 1e20:
+                result += '%12d      ??
%6d' % (id, available) + else: + result += '%12d %7d %6d' % (id, remaining, available) + return result + + def analyze(self): + global options, rpcs, intervals + + events = self.get_events() + interval_end = get_first_interval_end() + + self.local_rpcs = self.RpcDict() + self.local_nodes = self.NodeDict() + + count = 0 + for event in events: + t, op, node_name, id, offset = event[0:5] + rpc = self.local_rpcs[id] + node = self.local_nodes[node_name] + + while (t > interval_end) and options.data: + for name2, node2 in self.local_nodes.items(): + interval = get_interval(name2, interval_end) + if interval == None: + continue + interval['rx_grants'] = len(node2['rx_rpcs']) + interval['rx_grant_bytes'] = node2['rx_bytes'] + interval['rx_grant_info'] = self.rx_info(node2, self.local_rpcs) + interval['tx_grant_info'] = self.tx_info(node2, self.local_rpcs) + interval_end += options.interval + + # Update integrals. + delta = t - node['prev_time'] + rx_msgs = len(node['rx_rpcs']) + tx_msgs = len(node['tx_rpcs']) + node['rx_bytes_integral'] += node['rx_bytes'] * delta + node['tx_bytes_integral'] += node['tx_bytes'] * delta + node['rx_msgs_integral'] += rx_msgs * delta + node['tx_msgs_integral'] += tx_msgs * delta + if rx_msgs < 4: + node['rx_times'][rx_msgs] += delta + if tx_msgs < 4: + node['tx_times'][tx_msgs] += delta + node['prev_time'] = t + + # Update state + if op == 'rxdata': + offset += event[5] + old_offset = rpc['rx_data_offset'] + if offset <= old_offset: + continue + rpc['rx_data_offset'] = offset + grant = rpc['rx_grant_offset'] + if old_offset < grant: + if offset >= grant: + node['rx_bytes'] -= grant - old_offset + del node['rx_rpcs'][id] + else: + node['rx_bytes'] -= offset - old_offset + if 0 and node_name == 'node1': + print('%9.3f id %12d old_offset %7d new_offset %7d grant %7d, ' + 'rx_bytes %7d, rx_msgs %7d' + % (t, id, old_offset, offset, grant, + node['rx_bytes'], len(node['rx_rpcs']))) + + if op == 'txdata': + offset += event[5] + old_offset = rpc['tx_data_offset'] + if (offset < old_offset) or (old_offset < 0): + continue + grant = rpc['tx_grant_offset'] + rpc['tx_data_offset'] = offset + if old_offset < grant: + if offset >= grant: + node['tx_bytes'] -= grant - old_offset + del node['tx_rpcs'][id] + else: + node['tx_bytes'] -= offset - old_offset + if 0 and node_name == 'node1': + print('%9.3f: data %d, state %s' % (t, offset, rpc)) + + if op == 'rxgrant': + old_grant = rpc['tx_grant_offset'] + if (offset < old_grant) or (rpc['tx_data_offset'] < 0): + continue + data = rpc['tx_data_offset'] + rpc['tx_grant_offset'] = offset + if offset > data: + if old_grant > data: + node['tx_bytes'] += offset - old_grant + else: + node['tx_bytes'] += offset - data + node['tx_rpcs'][id] = True + + if op == 'txgrant': + old_grant = rpc['rx_grant_offset'] + data = rpc['rx_data_offset'] + if offset < old_grant: + continue + rpc['rx_grant_offset'] = offset + if offset > data: + if old_grant > data: + node['rx_bytes'] += offset - old_grant + else: + node['rx_bytes'] += offset - data + node['rx_rpcs'][id] = True + if 0 and node_name == 'node1': + print('%9.3f: grant %d, state %s' % (t, offset, rpc)) + + if 0 and node_name == 'node1': + count += 1 + if (count % 10) == 0: + self.check_node(node, self.local_rpcs) + + def output(self): + print('\n-------------------') + print('Analyzer: grants') + print('-------------------\n') + + print('Grant statistics:') + print('Node: Name of node') + print('InMsgs: Average number of incoming messages with outstanding grants') + print('InN: Fraction of time when 
N incoming messages had outstanding grants')
+        print('InKB:    Average KB of outstanding grants across all incoming messages')
+        print('OutMsgs: Average number of outgoing messages with available grants')
+        print('OutN:    Fraction of time when N outgoing messages had available grants')
+        print('OutKB:   Average KB of available grants across all outgoing messages')
+        print('')
+        print('Node       InMsgs  In0  In1  In2  In3   InKB OutMsgs Out0 Out1 Out2 Out3  OutKB')
+        print('-------------------------------------------------------------------------------')
+        total_in_msgs = 0
+        total_in_bytes = 0
+        total_out_msgs = 0
+        total_out_bytes = 0
+        for n in get_sorted_nodes():
+            node = self.local_nodes[n]
+            total_time = node['prev_time'] - traces[n]['first_time']
+            if total_time == 0:
+                print('%-10s No data available' % (n))
+                continue
+            total_in_msgs += node['rx_msgs_integral']/total_time
+            total_in_bytes += node['rx_bytes_integral']/total_time
+            total_out_msgs += node['tx_msgs_integral']/total_time
+            total_out_bytes += node['tx_bytes_integral']/total_time
+            print('%-10s %6.1f %4.2f %4.2f %4.2f %4.2f %6.1f  %6.1f %4.2f '
+                    '%4.2f %4.2f %4.2f %6.1f' % (n,
+                    node['rx_msgs_integral']/total_time,
+                    node['rx_times'][0]/total_time,
+                    node['rx_times'][1]/total_time,
+                    node['rx_times'][2]/total_time,
+                    node['rx_times'][3]/total_time,
+                    node['rx_bytes_integral']*1e-3/total_time,
+                    node['tx_msgs_integral']/total_time,
+                    node['tx_times'][0]/total_time,
+                    node['tx_times'][1]/total_time,
+                    node['tx_times'][2]/total_time,
+                    node['tx_times'][3]/total_time,
+                    node['tx_bytes_integral']*1e-3/total_time))
+        print('Average    %6.1f                     %6.1f  %6.1f'
+                '                     %6.1f' % (
+                total_in_msgs/len(self.local_nodes),
+                total_in_bytes/len(self.local_nodes)*1e-3,
+                total_out_msgs/len(self.local_nodes),
+                total_out_bytes/len(self.local_nodes)*1e-3))
+
+        # Create data files.
+ if options.data: + for name, node in self.local_nodes.items(): + f = open('%s/grants_rx_%s.dat' % (options.data, name), 'w') + f.write('# Node: %s\n' % (name)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Incoming messages with outstanding grants, as a ' + 'function of time.\n') + f.write('# Time: End of the time interval\n') + f.write('# IdN: Rpc identifier\n') + f.write('# RemN: Number of bytes that have not yet been ' + 'received for\n') + f.write('# the message\n') + f.write('# GrantN: Number of bytes that have been granted ' + 'but data has\n') + f.write('# not yet arrived\n') + f.write('\n') + f.write(' Time Id1 Rem1 Grant1 ' + 'Id2 Rem2 Grant2 Id3 Rem3 Grant3\n') + for interval in intervals[name]: + if not 'rx_grant_info' in interval: + continue + f.write('%7.1f %s\n' % (interval['time'], + interval['rx_grant_info'])) + f.close() + + f = open('%s/grants_tx_%s.dat' % (options.data, name), 'w') + f.write('# Node: %s\n' % (name)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Outgoing messages with available grants, as a ' + 'function of time.\n') + f.write('# Time: End of the time interval\n') + f.write('# IdN: Rpc identifier\n') + f.write('# RemN: Number of bytes that have not yet been ' + 'transmitted for\n') + f.write('# the message\n') + f.write('# GrantN: Number of bytes that have been granted ' + 'but data has\n') + f.write('# not yet been transmitted\n') + f.write('\n') + f.write(' Time Id1 Rem1 Grant1 ' + 'Id2 Rem2 Grant2 Id3 Rem3 Grant3\n') + for interval in intervals[name]: + if not 'tx_grant_info' in interval: + continue + f.write('%7.1f %s\n' % (interval['time'], + interval['tx_grant_info'])) + f.close() + + # Print stats related to homa_grant_check_rpc. + self.print_grant_check_stats() + +#------------------------------------------------ +# Analyzer: handoffs +#------------------------------------------------ +class AnalyzeHandoffs: + """ + Analyzes handoff delays for incoming messages (time from when + homa_rpc_handoff was called until homa_wait_for_message received + the message). 
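+    Statistics are reported separately for requests and responses, and for
+    handoffs to a waiting thread versus messages that had to be queued.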
+ """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + return + + def output(self): + global rpcs + + # node name -> list of delays in cases where there was a thread + # waiting for the message; separate info for requests and responses + node_req_handoffs = defaultdict(list) + node_resp_handoffs = defaultdict(list) + + # node name -> list of delays where messages had to be queued; + # separate info for requests and responses + node_req_queued = defaultdict(list) + node_resp_queued = defaultdict(list) + + for id, rpc in rpcs.items(): + if not 'found' in rpc: + continue + if 'handoff' in rpc: + delay = rpc['found'] - rpc['handoff'] + if id & 1: + node_req_handoffs[rpc['node']].append(delay) + else: + node_resp_handoffs[rpc['node']].append(delay) + elif 'queued' in rpc: + delay = rpc['found'] - rpc['queued'] + if id & 1: + node_req_queued[rpc['node']].append(delay) + else: + node_resp_queued[rpc['node']].append(delay) + + print('\n------------------') + print('Analyzer: handoffs') + print('------------------') + print('') + print('Delays in handing off RPCs to an application thread (elapsed ') + print('time from when homa_rpc_handoff was called at SoftIRQ level ') + print('until homa_wait_for_message received the RPC in the application):') + print('Node: Name of node') + print('FastFrac: Fraction of messages that were handed directly to') + print(' a waiting thread (no queueing)') + print('FAvg: Average delay for fast handoffs') + print('FP50: Median delay for fast handoffs') + print('FP90: 90th percentile delay for fast handoffs') + print('FP99: 99th percentile delay for fast handoffs') + print('QAvg: Average delay for handoffs where the message had') + print(' to be queued (no waiting thread)') + print('QP50: Median delay for queued handoffs') + print('QP90: 90th percentile delay for queued handoffs') + print('QP99: 99th percentile delay for queued handoffs') + print('') + + for i in [0, 1]: + if i == 0: + print("\nRequest messages:") + else: + print("\nResponse messages:") + print('Node FastFrac Favg FP50 FP90 FP99 QAvg ' + 'QP50 QP90 QP99') + print('----------------------------------------------------' + '------------------') + + for node in get_sorted_nodes(): + if i == 0: + handoffs = sorted(node_req_handoffs[node]) + queued = sorted(node_req_queued[node]) + else: + handoffs = sorted(node_resp_handoffs[node]) + queued = sorted(node_resp_queued[node]) + + print('%-10s %5.3f' % (node, + len(handoffs)/(len(handoffs) + len(queued))), + end='') + if handoffs: + print(' %5.1f %5.1f %6.1f %6.1f' % ( + sum(handoffs)/len(handoffs), + handoffs[(50*len(handoffs))//100], + handoffs[(90*len(handoffs))//100], + handoffs[(99*len(handoffs))//100]), end='') + else: + print(' '*24, end='') + if queued: + print(' %5.1f %5.1f %6.1f %6.1f' % ( + sum(queued)/len(queued), + queued[(50*len(queued))//100], + queued[(90*len(queued))//100], + queued[(99*len(queued))//100])) + else: + print('') + +#------------------------------------------------ +# Analyzer: incoming +#------------------------------------------------ +class AnalyzeIncoming: + """ + Generates detailed timelines of rates of incoming data and packets for + each core of each node. Use the --data option to specify a directory for + data files. + """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + return + + def write_node_data(self, node, pkts, max): + """ + Write a data file describing incoming traffic to a given node. 
+ + node: Name of the node + pkts: List of tuples describing + packets on that node. + max: Dictionary with values that accumulate information about the + highest throughput seen. + """ + global options + + interval = options.interval + + # Figure out which cores received packets on this node. + cores = {} + for pkt in pkts: + cores[pkt[2]] = 1 + core_ids = sorted(cores.keys()) + + pkts.sort(key=lambda t : t[0]) + start = pkts[0][0] + interval_end = (start//interval) * interval + if interval_end > start: + interval_end -= interval + + core_bytes = {} + core_pkts = {} + min_prio = 100 + + f = open('%s/incoming_%s.dat' % (options.data, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Rate of arrival of incoming data and packets, broken down\n') + f.write('# by core and time interval:\n') + f.write('# Time: End of the time interval\n') + f.write('# GbpsN: Data arrival rate on core N for the ' + 'interval (Gbps)\n') + f.write('# PktsN: Total packets (grants and data) that arrived on ' + 'core N in the interval\n') + f.write('# Gbps: Total arrival rate of data across all ' + 'cores (Gbps)\n') + f.write('# Pkts: Total packet arrivals (grants and data) across all ' + 'cores\n') + f.write('# MinP: Lowest priority level for any incoming packet\n') + f.write('\nInterval') + for c in core_ids: + f.write(' %6s %6s' % ('Gps%d' % c, 'Pkts%d' % c)) + f.write(' Gbps Pkts MinP\n') + + for t, length, core, priority in pkts: + while t >= interval_end: + if interval_end > start: + f.write('%8.1f' % (interval_end)) + total_gbps = 0 + total_pkts = 0 + for c in core_ids: + gbps = 8*core_bytes[c]/(interval*1e03) + f.write(' %6.1f %6d' % (gbps, core_pkts[c])) + total_gbps += gbps + total_pkts += core_pkts[c] + if core_pkts[c] > max['core_pkts']: + max['core_pkts'] = core_pkts[c] + max['core_pkts_time'] = interval_end + max['core_pkts_core'] = '%s, core %d' % (node, c) + if gbps > max['core_gbps']: + max['core_gbps'] = gbps + max['core_gbps_time'] = interval_end + max['core_gbps_core'] = '%s, core %d' % (node, c) + f.write(' %5.1f %5d %3d\n' % (total_gbps, + total_pkts, min_prio)) + if total_pkts > max['node_pkts']: + max['node_pkts'] = total_pkts + max['node_pkts_time'] = interval_end + max['node_pkts_node'] = node + if total_gbps > max['node_gbps']: + max['node_gbps'] = total_gbps + max['node_gbps_time'] = interval_end + max['node_gbps_node'] = node + for c in core_ids: + core_bytes[c] = 0 + core_pkts[c] = 0 + min_prio = 7 + interval_end += 20 + core_pkts[core] += 1 + core_bytes[core] += length + if priority < min_prio: + min_prio = priority + f.close() + + def output(self): + global packets, grants, options, rpcs + + # Node name -> list of packets for that node. Each packet is described + # by a tuple giving the arrival time and size of + # the packet (size 0 means the packet was a grant) and the core where + #it was received. 
+        nodes = defaultdict(list)
+
+        if options.data == None:
+            print('The incoming analyzer can\'t do anything without the '
+                    '--data option')
+            return
+
+        skipped = 0
+        total_pkts = 0
+        for pkt in packets.values():
+            if not 'gro' in pkt:
+                continue
+            total_pkts += 1
+            length = get_recv_length(pkt['offset'], pkt['msg_length'])
+            if not 'id' in pkt:
+                skipped += 1
+                continue
+            rpc = rpcs[pkt['id']^1]
+            nodes[rpc['node']].append([pkt['gro'], length, rpc['gro_core'],
+                    pkt['priority']])
+        if skipped > 0:
+            print('Incoming analyzer skipped %d packets out of %d (%.2f%%): '
+                    'couldn\'t compute length' % (skipped, total_pkts,
+                    100.0*skipped/total_pkts), file=sys.stderr)
+
+        for grant in grants.values():
+            if not 'gro' in grant:
+                continue
+            rpc = rpcs[grant['id']^1]
+            nodes[rpc['node']].append([grant['gro'], 0, rpc['gro_core'], 7])
+
+        print('\n-------------------')
+        print('Analyzer: incoming')
+        print('-------------------')
+
+        max = {
+            'core_pkts': 0, 'core_pkts_time': 0, 'core_pkts_core': 0,
+            'core_gbps': 0, 'core_gbps_time': 0, 'core_gbps_core': 0,
+            'node_pkts': 0, 'node_pkts_time': 0, 'node_pkts_node': 0,
+            'node_gbps': 0, 'node_gbps_time': 0, 'node_gbps_node': 0
+        }
+        for node, node_pkts in nodes.items():
+            self.write_node_data(node, node_pkts, max)
+        print('Maximum homa_gro_receive throughputs in a 20 usec interval:')
+        print('  Packets per core: %4d (time %7.1f, %s)' % (max['core_pkts'],
+                max['core_pkts_time'], max['core_pkts_core']))
+        print('  Gbps per core:    %5.1f (time %7.1f, %s)' % (max['core_gbps'],
+                max['core_gbps_time'], max['core_gbps_core']))
+        print('  Packets per node: %4d (time %7.1f, %s)' % (max['node_pkts'],
+                max['node_pkts_time'], max['node_pkts_node']))
+        print('  Gbps per node:    %5.1f (time %7.1f, %s)' % (max['node_gbps'],
+                max['node_gbps_time'], max['node_gbps_node']))
+
+#------------------------------------------------
+# Analyzer: intervals
+#------------------------------------------------
+class AnalyzeIntervals:
+    """
+    Populates the intervals global variable but doesn't actually print
+    anything. Generates information that is used by other analyzers.
+    """
+
+    def __init__(self, dispatcher):
+        dispatcher.interest('AnalyzeRpcs')
+        dispatcher.interest('AnalyzePackets')
+        self.tx_qid = None
+
+        # Node name -> list of <time, length> pairs, where time gives the
+        # time when a packet was handed off to the NIC and length gives
+        # the total length of the packet in bytes.
+        self.tcp_xmits = defaultdict(list)
+
+    def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes,
+            seq_ack):
+        self.tcp_xmits[trace['node']].append([t, data_bytes])
+
+    def restrict_qid(self, qid):
+        """
+        Ignore all packets except those that use the given transmit queue.
+ """ + self.tx_qid = qid + + def init_intervals(self): + global intervals, options + + # Initialize the intervals structure + intervals = {} + interval_length = options.interval + for node in traces.keys(): + node_intervals = [] + t = get_first_interval_end(node) + end = traces[node]['last_time'] + interval_length + while t < end: + interval = defaultdict(lambda: 0) + interval['time'] = t + interval['tx_max_gro_free'] = None + interval['tx_min_gro_free'] = None + interval['rx_grant_info'] = None + interval['tx_grant_info'] = None + node_intervals.append(interval) + t += interval_length + intervals[node] = node_intervals + + def add_grant_info(self, rpc): + """ + Analyzes incoming grants and outgoing packets for rpc and adds + tx_grant_avl and rx_granted information to intervals. + """ + + # List of tuples, where event is one of: + # grant_xmit: time and offset describe a grant passed to ip*xmit + # by receiver + # grant_softirq: time and offset describe a grant processed by + # SoftIRQ on sender. + # data: time and offset describe a data packet passed to + # ip*xmit by sender (offset is the byte just after the + # last one in the packet). + global rpcs + + events = [] + id = rpc['id'] + grant_xmit_offset = 0 + grant_softirq_offset = 0 + data_offset = 1e20 + prev_time = 0 + if id^1 in rpcs: + tx_rpc = rpcs[id^1] + tx_node = tx_rpc['node'] + for t, offset in tx_rpc['softirq_grant']: + events.append([t, 'grant_softirq', offset]) + for pkt in tx_rpc['send_data_pkts']: + if 'xmit2' in pkt and 'tso_length' in pkt: + offset = pkt['offset'] + events.append([pkt['xmit2'], 'data', + offset + pkt['tso_length']]) + if offset < data_offset: + # Computes initial data_offset, for cases where initial + # data packets aren't in the trace + data_offset = offset + + node = rpc['node'] + for t, offset, prio, increment in rpc['send_grant']: + events.append([t, 'grant_xmit', offset]) + if not events: + return + + for t, event, offset in sorted(events, key=lambda t : t[0]): + if grant_xmit_offset > data_offset: + add_to_intervals(node, prev_time, t, 'rx_granted', + grant_xmit_offset - data_offset) + if grant_softirq_offset > data_offset: + add_to_intervals(tx_node, prev_time, t, 'tx_grant_avl', + grant_softirq_offset - data_offset) + if event == 'grant_xmit': + if offset > grant_xmit_offset: + grant_xmit_offset = offset + elif event == 'grant_softirq': + if offset > grant_softirq_offset: + grant_softirq_offset = offset + elif event == 'data': + if offset > data_offset: + data_offset = offset + else: + raise Exception('Unknown event type %s in add_grant_info' % ( + event)) + prev_time = t + + def qlen(self, prev, elapsed): + """ + Compute the new length of the NIC queue + prev: Previous length of the queue + elapsed: Amount of time that has passed with no new transmissions + added to the queue + """ + global options + xmit_bytes = ((elapsed) * (1000.0*options.gbps/8)) + if xmit_bytes < prev: + new_length = prev - xmit_bytes + else: + new_length = 0 + return new_length + + def analyze(self): + """ + Fill in fields of intervals related to incoming messages. + """ + + global rpcs, packets, grants, max_unsched, traces, options, intervals + + # Node name -> list of pairs, where time gives the + # time when a packet was handed off to the NIC (or passed to ip*xmit) + # and length gives the total length of the packet in bytes. 
+ node_xmits = defaultdict(list) + for node, xmits in self.tcp_xmits.items(): + node_xmits[node].extend(xmits) + + # Total number of bytes a grant packet occupies on the wire, including + # headers, inter-packet gap, etc. + grant_bytes = 34 + 20 + 42 + + # Total number of bytes a data packet occupies on the wire, including + # everything *but* the actual data (headers, inter-packet gap, etc.) + data_overhead_bytes = 60 + 20 + 42 + + self.init_intervals() + late_usecs = options.late + + qid = options.tx_qid if options.tx_qid != None else 0 + + # See if packets include NIC xmit times + nic_data = False + for pkt in itertools.chain(packets.values(), tcp_packets.values()): + if ('xmit' in pkt) and ('gro' in pkt): + if 'nic' in pkt: + nic_data = True + break + + # Extract information from packets + for pkt in itertools.chain(packets.values(), tcp_packets.values()): + if (self.tx_qid != None) and ((not 'tx_qid' in pkt) + or (pkt['tx_qid'] != self.tx_qid)): + continue + tx_node = pkt['tx_node'] + if not 'length' in pkt: + print('Packet with no length: %s' % (pkt), file=sys.stderr) + continue + length = pkt['length'] + txmit = pkt['xmit2'] if 'xmit2' in pkt else None + if 'nic' in pkt: + tnic = pkt['nic'] + nic_start = tnic + nic_interval = get_interval(tx_node, tnic) + else: + tnic = None + if tx_node: + if not tx_node in traces: + print('Bogus node name %s. Packet: %s' % (tx_node, pkt)) + print('\nTraces: %s' % (traces)) + nic_start = traces[tx_node]['first_time'] + if 'free_tx_skb' in pkt: + tfree = pkt['free_tx_skb'] + nic_end = tfree + nic_end2 = tfree + else: + tfree = None + nic_end = 1e20 + nic_end2 = 1e20 + if 'gro' in pkt: + tgro = pkt['gro'] + if tgro < nic_end2: + nic_end2 = tgro + else: + tgro = None + + # For most tx statistics, process only the original TSO frame, + # not the generated segments + if ('tso_length' in pkt): + tso_length = pkt['tso_length'] + + if tx_node: + if nic_end < 1e20: + add_to_intervals(tx_node, nic_start, nic_end, + 'tx_in_nic', tso_length) + if nic_end2 < 1e20: + add_to_intervals(tx_node, nic_start, nic_end2, + 'tx_in_nic2', tso_length) + add_to_intervals(tx_node, nic_start, nic_end2, + 'pkts_in_nic2', 1) + if 'tx_qid' in pkt and pkt['tx_qid'] == qid: + add_to_intervals(tx_node, nic_start, nic_end2, + 'tx_in_nic_qx', tso_length) + + if txmit != None: + interval = get_interval(tx_node, txmit) + interval['tx_pkts'] += 1 + interval['tx_bytes'] += tso_length + if tnic != None: + add_to_intervals(tx_node, txmit, tnic, + 'tx_qdisc', tso_length) + + if tnic != None: + node_xmits[tx_node].append([tnic, + tso_length + data_overhead_bytes]) + nic_interval['tx_nic_pkts'] += 1 + nic_interval['tx_nic_bytes'] += tso_length + elif txmit != None: + node_xmits[tx_node].append([txmit, + tso_length + data_overhead_bytes]) + + if tfree != None: + interval = get_interval(tx_node, tfree) + interval['tx_free_bytes'] += tso_length + if tnic != None: + delay = tfree - tnic + if delay > nic_interval['tx_max_free']: + nic_interval['tx_max_free'] = delay + if (nic_interval['tx_min_free'] == 0) or (delay < + nic_interval['tx_min_free']): + nic_interval['tx_min_free'] = delay + if tgro != None: + add_to_intervals(tx_node, tgro, tfree, 'tx_nic_rx', + tso_length) + + if tgro != None: + interval = get_interval(tx_node, tgro) + if interval != None: + interval['tx_gro_bytes'] += length + + if not pkt['rx_node']: + continue + rx_node = pkt['rx_node'] + if tnic != None: + if txmit != None: + add_to_intervals(rx_node, txmit, tnic, 'rx_data_qdisc', length) + else: + add_to_intervals(rx_node, 
traces[tx_node]['first_time'], + tnic, 'rx_data_qdisc', length) + elif not nic_data: + tnic = txmit + if tnic != None: + nic_interval = get_interval(tx_node, tnic) + elif txmit != None: + add_to_intervals(rx_node, txmit, traces[tx_node]['last_time'], + 'rx_data_qdisc', length) + + if tgro != None: + interval = get_interval(rx_node, tgro) + interval['rx_pkts'] += 1 + interval['rx_bytes'] += length + if tnic != None: + add_to_intervals(rx_node, tnic, tgro, 'rx_data_net', length) + delay = tgro - tnic + if delay > late_usecs: + add_to_intervals(rx_node, tnic+late_usecs, tgro, + 'rx_overdue', length) + + if (tgro != None) and (tfree != None) and (tnic != None): + delay = tgro - tfree + if (nic_interval['tx_max_gro_free'] == None) or (delay > + nic_interval['tx_max_gro_free']): + nic_interval['tx_max_gro_free'] = delay + if (nic_interval['tx_min_gro_free'] == None) or (delay < + nic_interval['tx_min_gro_free']): + nic_interval['tx_min_gro_free'] = delay + + if 'softirq' in pkt: + tsoftirq = pkt['softirq'] + if tgro != None: + add_to_intervals(rx_node, tgro, tsoftirq, 'rx_data_gro', length) + else: + add_to_intervals(rx_node, traces[rx_node]['first_time'], + tsoftirq, 'rx_data_gro', length) + elif tgro != None and pkt['type'] == 'homa': + # Note: TCP doesn't yet provide softirq times, hence the + # exclusion above. + add_to_intervals(rx_node, tgro, traces[rx_node]['last_time'], + 'rx_data_gro', length) + + # Extract information from grants + for grant in grants.values(): + offset = grant['offset'] + increment = grant['increment'] + rx_id = grant['id']^1 + if not rx_id in rpcs: + continue + + if 'xmit' in grant: + txmit = grant['xmit'] + get_interval(grant['tx_node'], txmit)['rx_new_grants'] += increment + + # rx_* refers to the RPC that received the grant and tx'ed data + rx_rpc = rpcs[rx_id] + rx_node = rx_rpc['node'] + if 'gro' in grant: + tgro = grant['gro'] + if 'xmit' in grant: + add_to_intervals(rx_node, txmit, tgro, 'tx_grant_xmit', + increment) + else: + tgro = None + + if 'softirq' in grant: + tsoftirq = grant['softirq'] + get_interval(rx_node, tsoftirq)['tx_new_grants'] += increment + if tgro != None: + add_to_intervals(rx_node, tgro, tsoftirq, 'tx_grant_gro', + increment) + elif tgro != None: + add_to_intervals(rx_node, tgro, traces[rx_node]['last_time'], + 'tx_grant_gro', increment) + + if 'nic' in grant: + node_xmits[grant['tx_node']].append([grant['nic'], grant_bytes]) + elif 'xmit' in grant: + node_xmits[grant['tx_node']].append([txmit, grant_bytes]) + + # Extract information from RPCs + for id, rpc in rpcs.items(): + node = rpc['node'] + if not id & 1: + # rpcs_live + start = None + if 'sendmsg' in rpc: + start = rpc['sendmsg'] + if 'recvmsg_done' in rpc: + end = rpc['recvmsg_done'] + else: + end = traces[node]['last_time'] + elif 'recvmsg_done' in rpc: + start = traces[node]['first_time'] + end = rpc['recvmsg_done'] + elif ('remaining' in rpc) or ('sent' in rpc): + start = traces[node]['first_time'] + end = traces[node]['last_time'] + if start != None: + add_to_intervals(node, start, end, 'rpcs_live', 1) + + # tx_live, rx_live + if 'tx_live' in rpc: + start, end = rpc['tx_live'] + if id & 1: + add_to_intervals(node, start, end, 'tx_live_resp', 1) + else: + add_to_intervals(node, start, end, 'tx_live_req', 1) + if 'rx_live' in rpc: + start, end = rpc['rx_live'] + add_to_intervals(node, start, end, 'rx_live', 1) + + # tx_starts + if 'sendmsg' in rpc: + get_interval(node, rpc['sendmsg'])['tx_starts'] += 1 + + # rx_starts + if rpc['gro_data']: + t, offset, prio = rpc['gro_data'][0] 
+ if offset == 0: + get_interval(node, t)['rx_starts'] += 1 + + # tx_grant_avl and rx_granted + self.add_grant_info(rpc) + + # rx_grantable + in_length = rpc['in_length'] + if rpc['send_grant'] or (('unsched' in rpc) and (in_length != None) + and (in_length > rpc['unsched']) or (('granted' in rpc) + and (in_length != None) and (rpc['granted'] < in_length))): + start = traces[rpc['node']]['first_time'] + if rpc['softirq_data_pkts']: + start = rpc['softirq_data_pkts'][0]['softirq'] + + end = traces[rpc['node']]['last_time'] + if rpc['send_grant_pkts']: + last_grant = rpc['send_grant_pkts'][-1] + if last_grant['offset'] >= rpc['in_length']: + end = last_grant['xmit'] + add_to_intervals(node, start, end, 'rx_grantable', 1) + + # Compute NIC queue lengths + for node, xmits in node_xmits.items(): + if not xmits: + continue + xmits.sort(key=lambda t : t[0]) + cur_queue = 0 + prev_time = min(xmits[0][0], intervals[node][0]['time']) + i = 0 + t, bytes = xmits[i] + for interval in intervals[node]: + itime = interval['time'] + while itime >= t: + cur_queue = self.qlen(cur_queue, t - prev_time) + bytes + prev_time = t + i += 1 + if i >= len(xmits): + t = 1e20 + else: + t, bytes = xmits[i] + cur_queue = self.qlen(cur_queue, itime - prev_time) + prev_time = itime + interval['tx_q'] = cur_queue + +#------------------------------------------------ +# Analyzer: longterm +#------------------------------------------------ +class AnalyzeLongterm: + """ + Uses data recorded by homa_rpc_snapshot_log_tt to analyze statistics on + RPC progress for each node over a much longer time period than covered by + the traces themselves. Generates data about active messages as well as + arrival and service rates, with separate statistics for client vs. server + RPCs and requests vs. responses. This analyzer will not work unless + homa_rpc_snapshot_log_tt was invoked before freezing the timetraces. + If --data is specified then more detailed node-specific files are generated + in the data directory. + """ + + def __init__(self, dispatcher): + # Node name -> list of records for that node. Each record has + # the following fields: + # time: Time when the record was generated. 
+ # creq_start: The client_requests_started Homa metric
+ # creq_kbstart: The client_request_bytes_started Homa metric,
+ # except units are KB, not bytes
+ # creq_kbdone: The client_request_bytes_done Homa metric,
+ # except units are KB, not bytes
+ # creq_done: The client_requests_done Homa metric
+ # cresp_start: The client_responses_started Homa metric
+ # cresp_kbstart: The client_response_bytes_started Homa metric,
+ # except units are KB, not bytes
+ # cresp_kbdone: The client_response_bytes_done Homa metric,
+ # except units are KB, not bytes
+ # cresp_done: The client_responses_done Homa metric
+ # sreq_start: The server_requests_started Homa metric
+ # sreq_kbstart: The server_request_bytes_started Homa metric,
+ # except units are KB, not bytes
+ # sreq_kbdone: The server_request_bytes_done Homa metric,
+ # except units are KB, not bytes
+ # sreq_done: The server_requests_done Homa metric
+ # sresp_start: The server_responses_started Homa metric
+ # sresp_kbstart: The server_response_bytes_started Homa metric,
+ # except units are KB, not bytes
+ # sresp_kbdone: The server_response_bytes_done Homa metric,
+ # except units are KB, not bytes
+ # sresp_done: The server_responses_done Homa metric
+ self.node_records = defaultdict(list)
+
+ # A list with one entry for each interval of backlog data (not the
+ # same intervals as the global variable "intervals"). Each entry
+ # is a list with two values:
+ # time: The time that the interval represents
+ # indexes: A list with one entry for each element in
+ # get_sorted_nodes, which is the index of the first
+ # element in node_records whose time is at or after
+ # time, or -1 if there is no such entry or if the
+ # index would be zero (so there is no preceding entry)
+ self.intervals = []
+
+ # Elapsed time between elements of self.intervals
+ self.interval = None
+
+ def init_trace(self, trace):
+ # Time of the first snapshot record encountered for this node;
+ # serves as a reference point for time values in the records.
+ self.ref_time = None
+
+ def tt_snapshot_clock(self, trace, t, core, usecs):
+ if self.ref_time == None:
+ self.ref_time = t
+ records = self.node_records[trace['node']]
+ if len(records) > 0 and (not 'creq_start' in records[-1]
+ or not 'cresp_start' in records[-1]
+ or not 'sreq_start' in records[-1]
+ or not 'sresp_start' in records[-1]):
+ # Previous record was incomplete, so just remove it.
+ print('Removing incomplete snapshot record for node %s at ' + 'usecs %d' % (trace['node'], usecs)) + del records[-1] + records.append({'time': self.ref_time + usecs}) + + def tt_snapshot_client_request(self, trace, t, core, msgs_started, + bytes_started, bytes_done, msgs_done): + records = self.node_records[trace['node']] + if records: + record = records[-1] + if 'time' in record and not 'creq_start' in record: + record['creq_start'] = msgs_started + record['creq_kbstart'] = bytes_started + record['creq_kbdone'] = bytes_done + record['creq_done'] = msgs_done + + def tt_snapshot_client_response(self, trace, t, core, msgs_started, + bytes_started, bytes_done, msgs_done): + records = self.node_records[trace['node']] + if records: + record = records[-1] + if ('creq_start' in record and + not 'cresp_start' in record): + record['cresp_start'] = msgs_started + record['cresp_kbstart'] = bytes_started + record['cresp_kbdone'] = bytes_done + record['cresp_done'] = msgs_done + + def tt_snapshot_server_request(self, trace, t, core, msgs_started, + bytes_started, bytes_done, msgs_done): + records = self.node_records[trace['node']] + if records: + record = records[-1] + if ('cresp_start' in record and + not 'sreq_start' in record): + record['sreq_start'] = msgs_started + record['sreq_kbstart'] = bytes_started + record['sreq_kbdone'] = bytes_done + record['sreq_done'] = msgs_done + + def tt_snapshot_server_response(self, trace, t, core, msgs_started, + bytes_started, bytes_done, msgs_done): + records = self.node_records[trace['node']] + if records: + record = records[-1] + if ('sreq_start' in record and + not 'sresp_start' in record): + record['sresp_start'] = msgs_started + record['sresp_kbstart'] = bytes_started + record['sresp_kbdone'] = bytes_done + record['sresp_done'] = msgs_done + + def analyze(self): + """ + Determines the length of the intervals in the data and returns a + list with one entry for each interval. Each entry is a list with + two values: + time: The time that the interval represents + indexes: A list with one entry for each element in + get_sorted_nodes, which is the index of the first + element in node_records whose time is at or after + time, or -1 if there is no such entry or if the + index would be zero (so there is no preceding entry) + """ + + nodes = get_sorted_nodes() + start = 1e20 + end = -1e20 + interval = None + for node in nodes: + records = self.node_records[node] + if records[0]['time'] < start: + start = records[0]['time'] + if records[-1]['time'] > end: + end = records[-1]['time'] + + # Figure out the interval for records on this node (round to + # an integer that is all zeroes except the high-order digit) + tend = records[-1]['time'] + tstart = records[0]['time'] + node_interval = (tend - tstart) / (len(records) - 1) + node_interval = int(float('%.0g' % (node_interval))) + if interval == None: + interval = node_interval + elif interval != node_interval: + print('%s has a different interval for rx backlog records than %s (%d vs %d)' % + (node, nodes[0], node_interval, interval), file=sys.stderr) + + start = int(start) // interval * interval + + # Each iteration of the following loop generates one list of indexes + # for the result. 
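+ # For example (hypothetical values), with nodes ['node1', 'node2']
+ # an entry [500000, [3, -1]] would mean that for the interval at
+ # time 500000, node_records['node1'][3] is the first record at or
+ # after that time, while 'node2' has no usable record for it.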
+ next = [1] * len(nodes) + self.intervals = [] + for t in itertools.count(start, interval): + if t > end: + break + indices = [] + for i in range(0, len(nodes)): + records = self.node_records[nodes[i]] + if records[0]['time'] >= t or records[-1]['time'] < t: + indices.append(-1) + continue + while records[next[i]]['time'] < t: + next[i] += 1 + indices.append(next[i]) + # print('Index %d for %s has interval %d, time %d, usecs %d' % ( + # next[i], nodes[i], t, records[next[i]]['time'], + # records[next[i]]['usecs'])) + self.intervals.append([t, indices]) + + self.interval = interval + + def output_node_client_data(self, node, node_index): + """ + Generates a node-specific data file with time series data about + client RPCs issued by that node. + node: Name of node for which to print data + node_index: Index of info for this node in various arrays + """ + + f = open('%s/longterm_client_%s.dat' % (options.data, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Interval-based statistics about outgoing RPCs issued by ' + '%s\n' % (node)) + f.write('# Time: Time in seconds. The actual interval for the ' + 'data spans this\n') + f.write('# time and its length is approximately the same ' + 'as the time between\n') + f.write('# consecutive lines, but its end time could be ' + 'anywhere from the\n') + f.write('# given time up to the next time\n') + f.write('# ActvReq: Number of active request messages as of this interval\n') + f.write('# ReqMB: Pending request data as of this interval ' + '(untransmitted data in\n') + f.write('# active messages, Mbytes)\n') + f.write('# ReqStart: Rate at which new requests started in the ' + 'interval (K/sec)\n') + f.write('# ReqDStart: Total data in new requests that started in the ' + 'interval,\n') + f.write('# expressed as a rate (Gbps)\n') + f.write('# ReqDDone Rate at which request data was transmitted in the ' + 'interval (Gbps)\n') + f.write('# ReqDone: Rate at which request messages completed in the ' + 'interval (K/sec)\n') + f.write('# ActvResp: Number of active response messages as of this interval\n') + f.write('# RspMB: Unreceived response data as of this interval (MB)\n') + f.write('# RspStart: Rate at which new responses started in the ' + 'interval (K/sec)\n') + f.write('# RspDStart: Total data in new responses that started in the ' + 'interval,\n') + f.write('# expressed as a rate (Gbps)\n') + f.write('# RspDDone Rate at which response data was received in the ' + 'interval (Gbps)\n') + f.write('# RspDone: Rate at which response messages completed in the ' + 'interval (K/sec)\n') + f.write('\n') + f.write('# Time ActvReq ReqMB ReqStart ReqDStart ReqDDone ReqDone') + f.write(' ActvRsp RspMB RspStart RspDStart RspDDone RspDone\n') + + records = self.node_records[node] + for interval in self.intervals: + t = interval[0] + record_index = interval[1][node_index] + if record_index < 0: + continue + cur = records[record_index] + prev = records[record_index - 1] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['creq_start'] - cur['creq_done'] + kbpending = cur['creq_kbstart'] - cur['creq_kbdone'] + mstarts = cur['creq_start'] - prev['creq_start'] + kbstarts = cur['creq_kbstart'] - prev['creq_kbstart'] + kbdone = cur['creq_kbdone'] - prev['creq_kbdone'] + mdone = cur['creq_done'] - prev['creq_done'] + f.write('%10.3f %7d %6.2f %8.2f %9.2f %8.2f %7.2f' % (1e-06 * t, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / 
elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + mpending = cur['cresp_start'] - cur['cresp_done'] + kbpending = cur['cresp_kbstart'] - cur['cresp_kbdone'] + mstarts = cur['cresp_start'] - prev['cresp_start'] + kbstarts = cur['cresp_kbstart'] - prev['cresp_kbstart'] + kbdone = cur['cresp_kbdone'] - prev['cresp_kbdone'] + mdone = cur['cresp_done'] - prev['cresp_done'] + f.write(' %7d %6.2f %8.2f %9.2f %8.2f %7.2f\n' % ( + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + def output_node_server_data(self, node, node_index): + """ + Generates a node-specific data file with time series data about + server RPCs handled by that node. + node: Name of node for which to print data + node_index: Index of info for this node in various arrays + """ + + f = open('%s/longterm_server_%s.dat' % (options.data, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Interval-based statistics about incoming RPCs served by ' + '%s\n' % (node)) + f.write('# Time: Time in seconds. The actual interval for the ' + 'data spans this\n') + f.write('# time and its length is approximately the same ' + 'as the time between\n') + f.write('# consecutive lines, but its end time could be ' + 'anywhere from the\n') + f.write('# given time up to the next time\n') + f.write('# ActvReq: Number of active request messages as of this interval\n') + f.write('# ReqMB: Pending request data as of this interval ' + '(unreceived data in\n') + f.write('# active messages, Mbytes)\n') + f.write('# ReqStart: Rate at which new requests started in the ' + 'interval (K/sec)\n') + f.write('# ReqDStart: Total data in new requests that started in the ' + 'interval,\n') + f.write('# expressed as a rate (Gbps)\n') + f.write('# ReqDDone Rate at which request data was received in the ' + 'interval (Gbps)\n') + f.write('# ReqDone: Rate at which request messages completed in the ' + 'interval (K/sec)\n') + f.write('# ActvResp: Number of active response messages as of this interval\n') + f.write('# RspMB: Untransmitted response data as of this interval (MB)\n') + f.write('# RspStart: Rate at which new responses started in the ' + 'interval (K/sec)\n') + f.write('# RspDStart: Total data in new responses that started in the ' + 'interval,\n') + f.write('# expressed as a rate (Gbps)\n') + f.write('# RspDDone Rate at which response data was transmitted in the ' + 'interval (Gbps)\n') + f.write('# RspDone: Rate at which response messages completed in the ' + 'interval (K/sec)\n') + f.write('\n') + f.write('# Time ActvReq ReqMB ReqStart ReqDStart ReqDDone ReqDone') + f.write(' ActvRsp RspMB RspStart RspDStart RspDDone RspDone\n') + + records = self.node_records[node] + for interval in self.intervals: + t = interval[0] + record_index = interval[1][node_index] + if record_index < 0: + continue + cur = records[record_index] + prev = records[record_index - 1] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['sreq_start'] - cur['sreq_done'] + kbpending = cur['sreq_kbstart'] - cur['sreq_kbdone'] + mstarts = cur['sreq_start'] - prev['sreq_start'] + kbstarts = cur['sreq_kbstart'] - prev['sreq_kbstart'] + kbdone = cur['sreq_kbdone'] - prev['sreq_kbdone'] + mdone = cur['sreq_done'] - prev['sreq_done'] + f.write('%10.3f %7d %6.2f %8.2f %9.2f %8.2f %7.2f' % (1e-06 * t, + mpending, 1e-3 * kbpending, 1e-3 * 
(mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + mpending = cur['sresp_start'] - cur['sresp_done'] + kbpending = cur['sresp_kbstart'] - cur['sresp_kbdone'] + mstarts = cur['sresp_start'] - prev['sresp_start'] + kbstarts = cur['sresp_kbstart'] - prev['sresp_kbstart'] + kbdone = cur['sresp_kbdone'] - prev['sresp_kbdone'] + mdone = cur['sresp_done'] - prev['sresp_done'] + f.write(' %7d %6.2f %8.2f %9.2f %8.2f %7.2f\n' % ( + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + def output(self): + print('\n--------------------') + print('Analyzer: longterm') + print('--------------------\n') + + nodes = get_sorted_nodes() + + print('# Activity for client requests issued by each node over the ' + 'last 2 seconds:') + print('# Node: Name of node') + print('# Active: Number of active request messages at the end ' + 'of the traces') + print('# PendMB: Pending (untransmitted) data in active messages ' + 'at the') + print('# end of the traces (Mbytes)') + print('# MStart: Average rate at which new request messages ' + 'started (K/sec)') + print('# DStart: Total data in new requests that started, ' + 'expressed as a rate') + print('# (Gbps)') + print('# DDone Average rate at which request data was ' + 'transmitted (Gbps)') + print('# MDone: Average rate at which request messages ' + 'completed (K/sec)') + print('\n# Node Active PendMB MStart DStart DDone MDone') + + for node in nodes: + records = self.node_records[node] + cur = records[-1] + prev_index = len(records) - 1 - int(2.0 / (self.interval / 1e6)) + if prev_index < 0: + prev_index = 0 + prev = records[prev_index] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['creq_start'] - cur['creq_done'] + kbpending = cur['creq_kbstart'] - cur['creq_kbdone'] + mstarts = cur['creq_start'] - prev['creq_start'] + kbstarts = cur['creq_kbstart'] - prev['creq_kbstart'] + kbdone = cur['creq_kbdone'] - prev['creq_kbdone'] + mdone = cur['creq_done'] - prev['creq_done'] + print('%-10s %7d %7.2f %7.2f %7.2f %7.2f %7.2f' % (node, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + print() + print('# Activity for client responses received by each node ' + 'over the last 2 seconds:') + print('# Node: Name of node') + print('# Active: Number of active response messages as of the end ' + 'of the traces') + print('# PendMB: Pending (unreceived) data in active messages ' + 'at the') + print('# end of the traces (Mbytes)') + print('# MStart: Average rate at which new response messages started (K/sec)') + print('# DStart: Total data in new responses that started, ' + 'expressed as a rate') + print('# (Gbps)') + print('# DDone Average rate at which response data was received (Gbps)') + print('# MDone: Average rate at which response messages completed (K/sec)') + print('\n# Node Active PendMB MStart DStart DDone MDone') + + for node in nodes: + records = self.node_records[node] + cur = records[-1] + prev_index = len(records) - 1 - int(2.0 / (self.interval / 1e6)) + if prev_index < 0: + prev_index = 0 + prev = records[prev_index] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['cresp_start'] - cur['cresp_done'] + kbpending = cur['cresp_kbstart'] - cur['cresp_kbdone'] + mstarts = cur['cresp_start'] - 
prev['cresp_start'] + kbstarts = cur['cresp_kbstart'] - prev['cresp_kbstart'] + kbdone = cur['cresp_kbdone'] - prev['cresp_kbdone'] + mdone = cur['cresp_done'] - prev['cresp_done'] + print('%-10s %7d %7.2f %7.2f %7.2f %7.2f %7.2f' % (node, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + print('\n# Activity for server requests received by each node over ' + 'the last 2 seconds') + print('# (Columns are the same as for client responses)') + print('\n# Node Active PendMB MStart DStart DDone MDone') + + for node in nodes: + records = self.node_records[node] + cur = records[-1] + prev_index = len(records) - 1 - int(2.0 / (self.interval / 1e6)) + if prev_index < 0: + prev_index = 0 + prev = records[prev_index] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['sreq_start'] - cur['sreq_done'] + kbpending = cur['sreq_kbstart'] - cur['sreq_kbdone'] + mstarts = cur['sreq_start'] - prev['sreq_start'] + kbstarts = cur['sreq_kbstart'] - prev['sreq_kbstart'] + kbdone = cur['sreq_kbdone'] - prev['sreq_kbdone'] + mdone = cur['sreq_done'] - prev['sreq_done'] + print('%-10s %7d %7.2f %7.2f %7.2f %7.2f %7.2f' % (node, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + print('\n# Activity for server responses transmitted by each node over ' + 'the last 2 seconds') + print('# (Columns are the same as for client requests)') + print('\n# Node Active PendMB MStart DStart DDone MDone') + + for node in nodes: + records = self.node_records[node] + cur = records[-1] + prev_index = len(records) - 1 - int(2.0 / (self.interval / 1e6)) + if prev_index < 0: + prev_index = 0 + prev = records[prev_index] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['sresp_start'] - cur['sresp_done'] + kbpending = cur['sresp_kbstart'] - cur['sresp_kbdone'] + mstarts = cur['sresp_start'] - prev['sresp_start'] + kbstarts = cur['sresp_kbstart'] - prev['sresp_kbstart'] + kbdone = cur['sresp_kbdone'] - prev['sresp_kbdone'] + mdone = cur['sresp_done'] - prev['sresp_done'] + print('%-10s %7d %7.2f %7.2f %7.2f %7.2f %7.2f' % (node, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + if options.data: + for i in range(0, len(nodes)): + self.output_node_client_data(nodes[i], i) + self.output_node_server_data(nodes[i], i) + +#------------------------------------------------ +# Analyzer: lost +#------------------------------------------------ +class AnalyzeLost: + """ + Prints information about packets that appear to have been dropped + in the network. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + + def analyze(self): + global packets, traces + + # Packets that appear to have been lost. + self.lost_pkts = [] + + # node -> count of lost packets transmitted from that node. + self.tx_lost = defaultdict(lambda : 0) + + # node-> count of lost packets destined for that node. + self.rx_lost = defaultdict(lambda : 0) + + # node -> number of packets retransmitted + self.retransmits = defaultdict(lambda : 0) + + # RPC id -> True for all RPCs with at least one outgoing packet + # either lost or retransmitted. 
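+ # In sketch form, the criterion applied in the loop below treats a
+ # packet as lost when
+ #   'xmit' in pkt and 'gro' not in pkt and
+ #   first_time <= pkt['xmit'] <= last_time - 200
+ # where first_time/last_time bound the receiver's trace and the
+ # 200 usec margin allows for packets still in flight when the
+ # trace ended.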
+ self.lost_rpcs = {} + + # tx_node -> dict {rx_node -> core where GRO will happen for packets + # send from tx_node to rx_node} + self.rx_core = defaultdict(dict) + + for pkt in packets.values(): + if not 'xmit' in pkt: + continue + rx_node = get_rpc_node(pkt['id']^1) + if rx_node == '': + continue + if 'gro' in pkt: + if not pkt['tx_node']: + print('Strange packet: %s' % (pkt)) + self.rx_core[pkt['tx_node']][rx_node] = pkt['gro_core'] + continue + if (pkt['xmit'] + 200) > traces[rx_node]['last_time']: + continue + if pkt['xmit'] < traces[rx_node]['first_time']: + continue + self.lost_pkts.append(pkt) + self.tx_lost[pkt['tx_node']] += 1 + self.rx_lost[rx_node] += 1 + + for rpc in rpcs.values(): + self.retransmits[rpc['node']] += len(rpc['retransmits']) + + def output(self): + global packets, rpcs, options, traces + + print('\n--------------') + print('Analyzer: lost') + print('--------------') + print('Packets that appear to be lost: %d/%d (%.1f%%)' % (len(self.lost_pkts), + len(packets), 100*len(self.lost_pkts)/len(packets))) + num_retrans = sum(self.retransmits.values()) + print('Retransmitted packets: %d/%d (%.1f%%)' % (num_retrans, + len(packets), 100*num_retrans/len(packets))) + print('') + + print('A packet is considered to be "lost" if it has been transmitted') + print('but there is no evidence that it was ever received (presumably') + print('it has not been retransmitted).') + print('Node: Name of a node') + print('TxLost: Lost packets sent from this node') + print('RxLost: Lost packets destined to this node') + print('Retrans: Number of packets retransmitted by node') + + print('\nNode TxLost RxLost Retrans') + print('--------------------------------') + for node in get_sorted_nodes(): + print('%-10s %6d %6d %6d' % (node, self.tx_lost[node], + self.rx_lost[node], self.retransmits[node])) + + print('\nXmit TxNode RxNode RxCore RpcId Offset') + print('----------------------------------------------------------') + prev_xmit = 1e20 + prev_tx_node = '' + prev_rx_node = '' + prev_core = -1 + prev_id = 0 + for pkt in sorted(self.lost_pkts, key=lambda p : p['xmit']): + xmit = pkt['xmit'] + if xmit == prev_xmit: + xmit_info = '' + else: + xmit_info = '%.3f' % (xmit) + prev_xmit = xmit + + tx_node = pkt['tx_node'] + rx_node = get_rpc_node(pkt['id']^1) + if not rx_node in self.rx_core[tx_node]: + core_info = "???" + prev_core = -1 + else: + core = self.rx_core[tx_node][rx_node] + if core == prev_core: + core_info = '' + else: + core_info = '%d' % (core) + prev_core = core + + if tx_node == prev_tx_node: + tx_node = '' + else: + prev_tx_node = tx_node + + if rx_node == prev_rx_node: + rx_node = '' + else: + prev_rx_node = rx_node + + id = pkt['id'] + if id == prev_id: + id_info = '' + else: + id_info = '%d' % (id) + prev_id = id + + print('%9s %-10s %-10s %4s %12s %6d' % (xmit_info, tx_node, + rx_node, core_info, id_info, pkt['offset'])) + +#------------------------------------------------ +# Analyzer: minlatency +#------------------------------------------------ +class AnalyzeMinlatency: + """ + Analyzes packet information to compute the minimum one-way latency + between each pair of nodes. 
+ """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + + def analyze(self): + global grants, min_latency, packets + + nodes = get_sorted_nodes() + for src in nodes: + min_latency[src] = {} + for dst in nodes: + min_latency[src][dst] = math.inf + + for pkt in itertools.chain(packets.values(), grants.values()): + if not 'nic' in pkt or not 'gro' in pkt: + continue + delta = pkt['gro'] - pkt['nic'] + if delta < min_latency[pkt['tx_node']][pkt['rx_node']]: + min_latency[pkt['tx_node']][pkt['rx_node']] = delta + + def output(self): + global min_latency + + print('\n--------------------') + print('Analyzer: minlatency') + print('--------------------') + + print('\nMinimum one-way latency (microseconds) from when a packet ' + 'was queued') + print('for the NIC on a source node (rows) until it was received by ' + 'GRO on') + print('the destination node:') + + nodes = get_sorted_nodes() + print(' '*10, end='') + for dst in nodes: + print('%10s' % (dst), end='') + print('') + for src in nodes: + line = '%-10s' % (src) + for dst in nodes: + t = min_latency[src][dst] + if t == math.inf: + line += ' '*10 + else: + line += '%10.1f' % (t) + print(line.rstrip()) + +#------------------------------------------------ +# Analyzer: msgrange +#------------------------------------------------ +class AnalyzeMsgrange: + """ + Selects messages within a given range of lengths (--min and --max options), + divides those messages into a few percentile ranges, then prints statistics + on delays for each phase of message transmission for each percentile range. + """ + + def __init__(self, dispatcher): + require_options('msgrange', 'min', 'max') + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + + def get_stats(self, tx_rpcs): + """ + Given a collection of RPCs, returns a dictionary with the following + elements, each of which is a list of elapsed times extracted from all + of the messages or their packets: + elapsed: Times from sendmsg to recvmsg_done + xmit: Times from sendmsg to first packet xmit + nic: Times from xmit to nic for all packets + free: Times from nic to free for all packets + gro: Times from nic to gro for all packets + softirq: Times from gro to softirq for all packets + notify: Times from last softirq to recvmsg_done + """ + + global rpcs + elapsed = [] + xmit = [] + nic = [] + free = [] + gro = [] + softirq = [] + notify = [] + + for tx_rpc in tx_rpcs: + rx_rpc = rpcs[tx_rpc['id']^1] + max_softirq = -1e20 + elapsed.append(rx_rpc['recvmsg_done'] - tx_rpc['sendmsg']) + xmit.append(tx_rpc['send_data_pkts'][0]['xmit'] - tx_rpc['sendmsg']) + for pkt in tx_rpc['send_data_pkts']: + if ('nic' in pkt) and ('xmit' in pkt): + nic.append(pkt['nic'] - pkt['xmit']) + if ('free' in pkt) and ('nic' in pkt): + free.append(pkt['free'] - pkt['nic']) + if ('gro' in pkt) and ('nic' in pkt): + gro.append(pkt['gro'] - pkt['nic']) + if 'softirq' in pkt: + if 'gro' in pkt: + softirq.append(pkt['softirq'] - pkt['gro']) + max_softirq = max(max_softirq, pkt['softirq']) + if max_softirq != -1e20: + notify.append(rx_rpc['recvmsg_done'] - max_softirq) + return {'elapsed': elapsed, 'xmit': xmit, 'nic': nic, 'free': free, + 'gro': gro, 'softirq': softirq, 'notify': notify} + + def output(self): + global rpcs, packets, traces, options + + # List of where elapsed it time from sendmsg to + # recvmsg_done and rpc is the RPC that sent the message. + msgs = [] + + # Collect messages within the desired range of elapsed times. + # We must have "complete" information for each message. 
+ for tx_rpc in rpcs.values(): + id = tx_rpc['id'] + if id^1 not in rpcs: + continue + rx_rpc = rpcs[id^1] + if not 'sendmsg' in tx_rpc: + continue + start = tx_rpc['sendmsg'] + if not 'recvmsg_done' in rx_rpc: + continue + end = rx_rpc['recvmsg_done'] + elapsed = end - start + if traces[rx_rpc['node']]['first_time'] > start: + continue + if traces[tx_rpc['node']]['last_time'] < end: + continue + length = tx_rpc['out_length'] + if (length < options.min) or (length >= options.max): + continue + msgs.append([elapsed, tx_rpc]) + + msgs.sort(key=lambda t: t[0]) + sorted_rpcs = list(map(lambda t : t[1], msgs)) + stats = {} + for key, pmin, pmax in [ + ['all', 0, 100], + ['p10', 0, 10], + ['p50', 40, 60], + ['p90', 90, 95], + ['p99', 99, 100]]: + first = pmin*len(sorted_rpcs)//100 + last = pmax*len(sorted_rpcs)//100 + stats[key] = self.get_stats(sorted_rpcs[first:last]) + + print('\n------------------') + print('Analyzer: msgrange') + print('------------------') + print('') + print('Delay statistics for messages with sizes between %d and %d ' + 'bytes,' % (options.min, options.max)) + print('divided into various percentile ranges by total elapsed time:') + print('Count: Total number of messages in this percentile range') + print('Elapsed: Average time from sendmsg to recvmsg_done') + print('Xmit: Average time from sendmsg to first NIC handoff') + print('Gro: Avg. time from NIC handoff to homa_gro_receive for packets') + print('Free: Avg. time from NIC handoff to tx packet free') + print('Softirq: Avg. time from homa_gro_receive to homa_softirq') + print('Notify: Avg. time from last homa_softirq call to recvmsg_done') + print('') + print(' Count Elapsed Xmit Nic Gro Free Softirq Notify') + print('-------------------------------------------------------------------') + for key, label in [ + ['all', 'All'], + ['p10', 'P0-10'], + ['p50', 'P40-60'], + ['p90', 'P90-95'], + ['p99', 'P99-100'], + ]: + s = stats[key] + print('%-10s %5d %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' % ( + label, len(s['elapsed']), + sum(s['elapsed'])/len(s['elapsed']), + sum(s['xmit'])/len(s['xmit']), + sum(s['nic'])/len(s['nic']), + sum(s['gro'])/len(s['gro']), + sum(s['free'])/len(s['free']), + sum(s['softirq'])/len(s['softirq']), + sum(s['notify'])/len(s['notify'])) + ) + +#------------------------------------------------ +# Analyzer: net +#------------------------------------------------ +class AnalyzeNet: + """ + Prints information about delays in the network including NICs, network + delay and congestion, and receiver GRO overload. With --data, generates + data files describing backlog and delay over time on a core-by-core + basis. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + return + + def collect_events(self): + """ + Matches up packet sends and receives for all RPCs to return a + dictionary that maps from the name for a receiving node to a + list of events for that receiver. 
Each event is a + list: + time: Time when the event occurred + event: What happened: "xmit" for packet transmission or "recv" + for packet reception (by GRO) + length: Number of message bytes in packet + core: Core where packet was processed by GRO + delay: End-to-end delay for packet; zero for xmit events + """ + + global rpcs, traces, options + receivers = defaultdict(list) + + # Process RPCs in sender-receiver pairs to collect data + for xmit_id, xmit_rpc in rpcs.items(): + recv_id = xmit_id ^ 1 + if not recv_id in rpcs: + continue + recv_rpc = rpcs[recv_id] + receiver = receivers[recv_rpc['node']] + if not 'gro_core' in recv_rpc: + continue + core = recv_rpc['gro_core'] + + xmit_pkts = sorted(xmit_rpc['send_data'], key=lambda t : t[1]) + if xmit_pkts: + xmit_end = xmit_pkts[-1][1] + xmit_pkts[-1][2] + elif 'out_length' in xmit_rpc: + xmit_end = xmit_rpc['out_length'] + elif 'in_length' in recv_rpc: + xmit_end = recv_rpc['in_length'] + else: + # Not enough info to process this RPC + continue + + recv_pkts = sorted(recv_rpc['gro_data'], + key=lambda tuple : tuple[1]) + xmit_ix = 0 + if xmit_pkts: + xmit_time, xmit_offset, xmit_length = xmit_pkts[0] + else: + xmit_offset = 100000000 + xmit_length = 0 + xmit_bytes = 0 + for i in range(0, len(recv_pkts)): + recv_time, recv_offset, prio = recv_pkts[i] + length = get_recv_length(recv_offset, xmit_end) + + while recv_offset >= (xmit_offset + xmit_length): + if xmit_bytes: + receiver.append([xmit_time, "xmit", xmit_bytes, + core, 0.0]) + xmit_ix += 1 + if xmit_ix >= len(xmit_pkts): + break + xmit_time, xmit_offset, xmit_length = xmit_pkts[xmit_ix] + xmit_bytes = 0 + if recv_offset < xmit_offset: + # No xmit record; skip + continue + if xmit_ix >= len(xmit_pkts): + # Receiver trace extends beyond sender trace; ignore extras + break + + # Skip retransmitted packets (too hard to account for). + retransmit = False + for resend in recv_rpc['resend_rx']: + if resend[1] == recv_offset: + retransmit = True + break + if retransmit or (recv_offset in xmit_rpc['retransmits']): + continue + receiver.append([recv_time, "recv", length, core, + recv_time - xmit_time]) + if recv_time < xmit_time and not options.negative_ok: + print('%9.3f Negative delay, xmit_time %9.3f, ' + 'xmit_node %s recv_node %s recv_offset %d ' + 'xmit_offset %d xmit_length %d' + % (recv_time, xmit_time, xmit_rpc['node'], + recv_rpc['node'], recv_offset, xmit_offset, + xmit_length), file=sys.stderr) + xmit_bytes += length + if xmit_bytes: + receiver.append([xmit_time, "xmit", xmit_bytes, core, 0.0]) + + for name, receiver in receivers.items(): + receiver.sort(key=lambda tuple : tuple[0]) + return receivers + + def summarize_events(self, events): + """ + Given a dictionary returned by collect_events, return information + about each GRO core as a dictionary indexed by node names. 
Each + element is a dictionary indexed by cores, which in turn is a + dictionary with the following values: + num_packets: Total number of packets received by the core + avg_delay: Average end-to-end delay for packets + max_delay: Worst-case end-to-end delay + max_delay_time: Time when max_delay occurred + avg_backlog: Average number of bytes of data in transit + max_backlog: Worst-case number of bytes of data in transit + max_backlog_time: Time when max_backlog occurred + """ + global options + + stats = defaultdict(lambda: defaultdict(lambda: { + 'num_packets': 0, + 'avg_delay': 0, + 'max_delay': 0, + 'max_delay_time': 0, + 'avg_backlog': 0, + 'max_backlog': 0, + 'cur_backlog': 0, + 'prev_time': 0})) + + for name, node_events in events.items(): + node = stats[name] + for event in node_events: + time, type, length, core, delay = event + core_data = node[core] + core_data['avg_backlog'] += (core_data['cur_backlog'] * + (time - core_data['prev_time'])) + if type == "recv": + core_data['num_packets'] += 1 + core_data['avg_delay'] += delay + if delay > core_data['max_delay']: + core_data['max_delay'] = delay + core_data['max_delay_time'] = time + if core_data['cur_backlog'] == core_data['max_backlog']: + core_data['max_backlog_time'] = time + core_data['cur_backlog'] -= length + if (delay < 0) and not options.negative_ok: + print('Negative delay: %s' % (event)) + else: + core_data['cur_backlog'] += length + if core_data['cur_backlog'] > core_data['max_backlog']: + core_data['max_backlog'] = core_data['cur_backlog'] + core_data['prev_time'] = time + for core_data in node.values(): + core_data['avg_delay'] /= core_data['num_packets'] + core_data['avg_backlog'] /= traces[name]['elapsed_time'] + return stats + + def generate_delay_data(self, events, dir): + """ + Creates data files for the delay information in events. + + events: Dictionary of events returned by collect_events. + dir: Directory in which to write data files (one file per node) + """ + + for node, node_events in events.items(): + # Core number -> list of tuples for that core. Each + # tuple indicates when a packet was processed by GRO on that core, + # and the packet's end-to-end delay. The list for each core is + # sorted in increasing time order. + core_data = defaultdict(list) + for event in node_events: + event_time, type, length, core, delay = event + if type != "recv": + continue + core_data[core].append([event_time, delay]) + + cores = sorted(core_data.keys()) + max_len = 0 + for core in cores: + length = len(core_data[core]) + if length > max_len: + max_len = length + + f = open('%s/net_delay_%s.dat' % (dir, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + doc = ('# Packet delay information for a single node, broken ' + 'out by the core ' + 'where the packet is processed by GRO. For each active core ' + 'there are two columns, TimeN and ' + 'DelayN. 
Each line corresponds to a packet that was processed '
+ 'by homa_gro_receive on core N at the given time with '
+ 'the given delay '
+ '(measured end to end from ip_*xmit call to homa_gro_receive '
+ 'call)')
+ f.write('\n# '.join(textwrap.wrap(doc)))
+ f.write('\n')
+ for core in cores:
+ t = 'Time%d' % core
+ d = 'Delay%d' % core
+ f.write('%8s%8s' % (t, d))
+ f.write('\n')
+ for i in range(0, max_len):
+ for core in cores:
+ pkts = core_data[core]
+ if i >= len(pkts):
+ # Pad with blanks to keep columns aligned (16 chars
+ # matches the '%8.1f %7.1f' format below).
+ f.write(' ' * 16)
+ else:
+ f.write('%8.1f %7.1f' % (pkts[i][0], pkts[i][1]))
+ f.write('\n')
+ f.close()
+
+ def generate_backlog_data(self, events, dir):
+ """
+ Creates data files for per-core backlog information.
+
+ events: Dictionary of events returned by collect_events.
+ dir: Directory in which to write data files (one file per node)
+ """
+ global options
+
+ for node, node_events in events.items():
+ # Core number -> list; entry i in the list is the backlog on that
+ # core at the end of interval i.
+ backlogs = defaultdict(list)
+
+ interval_length = 20.0
+ start = (node_events[0][0]//interval_length) * interval_length
+ interval_end = start + interval_length
+ cur_interval = 0
+
+ for event in node_events:
+ event_time, type, length, core, delay = event
+ while event_time >= interval_end:
+ interval_end += interval_length
+ cur_interval += 1
+ for core_intervals in backlogs.values():
+ core_intervals.append(core_intervals[-1])
+
+ if not core in backlogs:
+ backlogs[core] = [0] * (cur_interval+1)
+ if type == "recv":
+ backlogs[core][-1] -= length
+ else:
+ backlogs[core][-1] += length
+
+ cores = sorted(backlogs.keys())
+
+ f = open('%s/net_backlog_%s.dat' % (dir, node), "w")
+ f.write('# Node: %s\n' % (node))
+ f.write('# Generated at %s.\n' %
+ (time.strftime('%I:%M %p on %m/%d/%Y')))
+ doc = ('# Time-series history of backlog for each active '
+ 'GRO core on this node. "Total" gives the total backlog '
+ 'across all cores, and "BackC" shows the backlog '
+ 'on core C at the given time (in usec). 
Backlog ' + 'is the KB of data destined ' + 'for a core that have been passed to ip*_xmit at the sender ' + 'but not yet seen by homa_gro_receive on the receiver.') + f.write('\n# '.join(textwrap.wrap(doc))) + f.write('\n Time Total') + for core in cores: + f.write(' %7s' % ('Back%d' % core)) + f.write('\n') + for i in range(0, cur_interval): + f.write('%8.1f' % (start + (i+1)*interval_length)) + total = 0 + for core in cores: + total += backlogs[core][i] / 1000 + f.write(' %7.1f' % total) + for core in cores: + f.write(' %7.1f' % (backlogs[core][i] / 1000)) + f.write('\n') + f.close() + + def output(self): + global rpcs, traces, options + + events = self.collect_events() + + if options.data != None: + self.generate_delay_data(events, options.data) + self.generate_backlog_data(events, options.data) + + stats = self.summarize_events(events) + + print('\n--------------') + print('Analyzer: net') + print('--------------') + print('Network delay (including sending NIC, network, receiving NIC, and GRO') + print('backup) for packets with GRO processing on a particular core.') + print('Node: Receiving node for packets') + print('Core: Core identifier on Node') + print('Pkts: Total incoming data packets processed by Core on Node') + print('AvgDelay: Average end-to-end delay from ip_*xmit invocation to ' + 'GRO (usec)') + print('MaxDelay: Maximum end-to-end delay, and the time when the max packet was') + print(' processed by GRO (usec)') + print('AvgBack: Average backup for Core on Node (total data bytes that were') + print(' passed to ip_*xmit but not yet seen by GRO) (KB)') + print('MaxBack: Maximum backup for Core (KB) and the time when GRO processed') + print(' a packet from that backup') + print('') + print('Node Core Pkts AvgDelay MaxDelay (Time) ' + 'AvgBack MaxBack (Time)') + print('---------------------------------------------------------' + '-----------------------------', end='') + for name in get_sorted_nodes(): + if not name in stats: + continue + node = stats[name] + print('') + for core in sorted(node.keys()): + core_data = node[core] + print('%-10s %4d %6d %9.1f %9.1f (%10.3f) %8.1f %8.1f (%10.3f)' % ( + name, core, core_data['num_packets'], + core_data['avg_delay'], core_data['max_delay'], + core_data['max_delay_time'], + core_data['avg_backlog'] * 1e-3, + core_data['max_backlog'] * 1e-3, + core_data['max_backlog_time'])) + +#------------------------------------------------ +# Analyzer: nicbacklog +#------------------------------------------------ +class AnalyzeNicbacklog: + """ + Prints a time-series analysis of backlog in the NIC (packets that + have been passed to the NIC but not yet returned after transmission) + along with information about the rate of packets flowing into the + NIC and being returned from the NIC. Requires the --data option. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') + require_options('nicbacklog', 'data') + + def output(self): + global packets, tcp_packets, options, traces + + # Microseconds in the smallest interval we'll consider for + # computing rates. + base_interval = 50 + + # node -> list of packets transmitted by that node + node_pkts = defaultdict(list) + + # node -> list of interval stats for the largest size interval + # (4*base_interval) on that node. 
Each list element consists of a list
+ # [time, nic_pkts, nic_bytes, free_pkts, free_bytes]:
+ # time: End time of the interval
+ # nic_pkts: # packets owned by NIC at time
+ # nic_bytes: Bytes of data in packets owned by NIC at time
+ # free_pkts: # packets returned to Linux in the interval
+ # free_bytes: Bytes of data in free_pkts
+ node_intervals = defaultdict(list)
+
+ # node -> running sum of packets owned by NIC on that node * time.
+ node_nic_pkts = defaultdict(lambda : 0)
+
+ # node -> running sum of bytes owned by NIC on that node * time.
+ node_nic_bytes = defaultdict(lambda : 0)
+
+ # node -> total bytes passed to NIC over trace
+ node_total_in = defaultdict(lambda: 0)
+
+ # node -> total bytes freed after return from NIC over trace
+ node_total_freed = defaultdict(lambda: 0)
+
+ print('\n--------------------')
+ print('Analyzer: nicbacklog')
+ print('--------------------')
+ print('See data files %s/nicbacklog_*.dat' % (options.data))
+ print('\nSummary data for each node:')
+ print('Node: Name of node')
+ print('AvgPkts: Average # packets owned by NIC at one time')
+ print('AvgKB: Average Kbytes of data in packets owned by NIC')
+ print('MaxPkts: Maximum packets owned by NIC at one time')
+ print('MaxKB: Maximum Kbytes of data in packets owned by NIC '
+ 'at one time')
+ print('MaxInP: Maximum packets passed to NIC in a %d usec interval' %
+ (4 * base_interval))
+ print('MaxInD: Maximum data rate from pkts passed to NIC in a %d '
+ 'usec interval (Gbps)' % (4 * base_interval))
+ print('MaxFrP: Maximum packets freed in a %d usec interval after '
+ 'return from NIC' % (4 * base_interval))
+ print('MaxFrD: Maximum data rate from pkts freed in a %d usec '
+ 'interval (Gbps)' % (4 * base_interval))
+ print('AvgIn: Average rate of data handed off to NIC (Gbps)')
+ print('AvgFr: Average rate of data freed after return from NIC (Gbps)')
+ print()
+ print('Node AvgPkts AvgKB MaxPkts MaxKB MaxInP MaxInD MaxFrP MaxFrD AvgIn AvgFr')
+ print('-------------------------------------------------------------------------------------')
+
+ # Bucket all of the packets by transmitting node. Also compute
+ # average backlog data (this calculation will consider packets
+ # that don't have enough data to use in later calculations).
+ for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+ if not 'tso_length' in pkt or not pkt['tx_node']:
+ continue
+ length = pkt['tso_length']
+ node = pkt['tx_node']
+ if 'nic' in pkt:
+ t1 = pkt['nic']
+ node_total_in[node] += length
+ else:
+ if not 'free_tx_skb' in pkt:
+ continue
+ t1 = traces[node]['first_time']
+ if 'free_tx_skb' in pkt:
+ t2 = pkt['free_tx_skb']
+ node_total_freed[node] += length
+ else:
+ t2 = traces[node]['last_time']
+ delta_t = t2 - t1
+ node_nic_pkts[node] += delta_t
+ node_nic_bytes[node] += delta_t * length
+
+ if 'nic' in pkt and 'free_tx_skb' in pkt:
+ node_pkts[node].append(pkt)
+
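+ # A sketch of the bookkeeping used in the loop below (names as in
+ # the code that follows): packets currently owned by the NIC sit in
+ # a heapq ordered by the time they are freed,
+ #   heapq.heappush(active, [pkt['free_tx_skb'], cur, pkt])
+ #   while active and active[0][0] < interval_end:
+ #       pkt = heapq.heappop(active)[2]
+ # where the middle index element breaks ties so that the dict in the
+ # last slot is never compared. Rates are then reported over windows
+ # of 1x, 2x and 4x base_interval; e.g. (hypothetical numbers) 500 KB
+ # freed within a 200 usec window is 500e3 * 8 / (200 * 1000) =
+ # 20 Gbps.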
+ # Each iteration of this loop generates data for one node.
+ for node in get_sorted_nodes():
+ f = open('%s/nicbacklog_%s.dat' % (options.data, node), 'w')
+ f.write('# Node: %s\n' % (node))
+ f.write('# Generated at %s.\n\n' %
+ (time.strftime('%I:%M %p on %m/%d/%Y')))
+ f.write('# NIC backlog (packets passed to the NIC but not yet '
+ 'returned to the\n')
+ f.write('# kernel) as a function of time\n')
+ f.write('# Time: Time of measurement (usecs)\n')
+ f.write('# NicPkts: Packets currently owned by NIC\n')
+ f.write('# NicKB: Kbytes of data in packets currently owned by NIC\n')
+
+ f.write('# %-8s Packets passed to NIC in last %d usecs\n' %
+ ('InP%d:' % (base_interval), base_interval))
+ f.write('# %-8s Data rate from packets passed to NIC in last %d '
+ 'usecs (Gbps)\n' %
+ ('InB%d:' % (base_interval), base_interval))
+ f.write('# %-8s Packets freed in last %d usecs after return from '
+ 'NIC\n' %
+ ('FrP%d:' % (base_interval), base_interval))
+ f.write('# %-8s Data rate from packets freed in last %d usecs '
+ '(Gbps)\n' %
+ ('FrB%d:' % (base_interval), base_interval))
+
+ f.write('# %-8s Packets passed to NIC in last %d usecs\n' %
+ ('InP%d:' % (2*base_interval), 2*base_interval))
+ f.write('# %-8s Data rate from packets passed to NIC in last %d '
+ 'usecs (Gbps)\n' %
+ ('InB%d:' % (2*base_interval), 2*base_interval))
+ f.write('# %-8s Packets freed in last %d usecs after return from '
+ 'NIC\n' %
+ ('FrP%d:' % (2*base_interval), 2*base_interval))
+ f.write('# %-8s Data rate from packets freed in last %d usecs '
+ '(Gbps)\n' %
+ ('FrB%d:' % (2*base_interval), 2*base_interval))
+
+ f.write('# %-8s Packets passed to NIC in last %d usecs\n' %
+ ('InP%d:' % (4*base_interval), 4*base_interval))
+ f.write('# %-8s Data rate from packets passed to NIC in last %d '
+ 'usecs (Gbps)\n' %
+ ('InB%d:' % (4*base_interval), 4*base_interval))
+ f.write('# %-8s Packets freed in last %d usecs after return from '
+ 'NIC\n' %
+ ('FrP%d:' % (4*base_interval), 4*base_interval))
+ f.write('# %-8s Data rate from packets freed in last %d usecs '
+ '(Gbps)\n' %
+ ('FrB%d:' % (4*base_interval), 4*base_interval))
+
+ f.write('\nTime NicPkts NicKB')
+ for i in [base_interval, base_interval*2, base_interval*4]:
+ f.write(' %6s' % ('InP%d' % (i)))
+ f.write(' %7s' % ('InB%d' % (i)))
+ f.write(' %6s' % ('FrP%d' % (i)))
+ f.write(' %7s' % ('FrB%d' % (i)))
+ f.write('\n')
+
+ # heapq of all active packets (those that are currently in
+ # the possession of the NIC) in increasing order of free time.
+ # Entries are [free, index, pkt], where free is the packet's
+ # free_tx_skb time (for sorting), index is the packet's index
+ # in the list of all packets (for resolving sorting ties), and
+ # pkt is the packet.
+ active = []
+
+ # list of [in_pkts, in_bytes, free_pkts, free_bytes] for each of
+ # 4 intervals, where intervals[0] is the newest interval.
+ # in_pkts: packets passed to the NIC in the interval
+ # in_bytes: bytes of data in packets passed to the NIC
+ # free_pkts: packets returned to Linux and freed in the interval
+ # free_bytes: bytes of data in packets freed in the interval
+ intervals = deque()
+ for _ in range(4):
+ intervals.appendleft([0, 0, 0, 0])
+
+ # End of the current interval (the next one to be added to
+ # intervals)
+ interval_end = 0
+
+ # Maximum values for any of the largest size intervals. 
+ max_pkts = 0 + max_bytes = 0 + max_in_pkts = 0 + max_in_bytes = 0 + max_free_pkts = 0 + max_free_bytes = 0 + + # Bytes and packets owned by the NIC as of current time + nic_pkts = 0 + nic_bytes = 0 + + pkts = sorted(node_pkts[node], key = lambda pkt : pkt['nic']) + interval_end = (math.ceil(pkts[0]['nic'] / base_interval) * + base_interval) + interval_ix = 0 + cur = 0 + # print('\n%s: %d packets:' % (node, len(node_pkts[node]))) + + # Each iteration of this loop handles a new interval. + while cur < len(pkts) or len(active) > 0: + in_pkts = 0 + in_bytes = 0 + free_pkts = 0 + free_bytes = 0 + + while cur < len(pkts) and pkts[cur]['nic'] <= interval_end: + pkt = pkts[cur] + # print('\n%9.3f: to Nic: %s' % (pkt['nic'], pkt['free_tx_skb'])) + cur += 1 + in_pkts += 1 + in_bytes += pkt['tso_length'] + get_hdr_length(pkt) + heapq.heappush(active, [pkt['free_tx_skb'], cur, pkt]) + while len(active) > 0 and active[0][0] < interval_end: + pkt = heapq.heappop(active)[2] + # print('\n%9.3f: freed: %s' % (pkt['free_tx_skb'], pkt)) + free_pkts += 1 + free_bytes += pkt['tso_length'] + get_hdr_length(pkt) + + nic_pkts += in_pkts - free_pkts + nic_bytes += in_bytes - free_bytes + intervals.pop() + intervals.appendleft([in_pkts, in_bytes, free_pkts, free_bytes]) + + # print('%7.1f: %8d %8d %8d %8d %8d %8d' % (interval_end, + # in_pkts, in_bytes, free_pkts, free_bytes, + # in_pkts - free_pkts, in_bytes - free_bytes)) + + f.write('%7.1f %5d %6d' % (interval_end, nic_pkts, + nic_bytes/1000)) + f.write(' %6d %7.2f %6d %7.2f' % ( + in_pkts, in_bytes*8/(1000*base_interval), + free_pkts, free_bytes*8/(1000*base_interval))) + + interval_ix += 1 + if nic_bytes >= 200000: + node_intervals[node].append([interval_end, nic_pkts, + nic_bytes, free_pkts, free_bytes]) + + in_pkts += intervals[1][0] + in_bytes += intervals[1][1] + free_pkts += intervals[1][2] + free_bytes += intervals[1][3] + f.write(' %6d %7.2f %6d %7.2f' % ( + in_pkts, in_bytes*8/(2000*base_interval), + free_pkts, free_bytes*8/(2000*base_interval))) + in_pkts += intervals[2][0] + intervals[3][0] + in_bytes += intervals[2][1] + intervals[3][1] + free_pkts += intervals[2][2] + intervals[3][2] + free_bytes += intervals[2][3] + intervals[3][3] + f.write(' %6d %7.2f %6d %7.2f' % ( + in_pkts, in_bytes*8/(4000*base_interval), + free_pkts, free_bytes*8/(4000*base_interval))) + f.write('\n') + + # Update maximum values + if nic_pkts > max_pkts: + max_pkts = nic_pkts + if nic_bytes > max_bytes: + max_bytes = nic_bytes + if in_pkts > max_in_pkts: + max_in_pkts = in_pkts + if in_bytes > max_in_bytes: + max_in_bytes = in_bytes + if free_pkts > max_free_pkts: + max_free_pkts = free_pkts + if free_bytes > max_free_bytes: + max_free_bytes = free_bytes + + interval_end += base_interval + f.close() + node_time = traces[node]['last_time'] - traces[node]['first_time'] + print('%-10s %6d %6d %7d %6d %6d %7.2f %6d %7.2f %7.2f %7.2f' % (node, + node_nic_pkts[node]/node_time, + 1e-3*node_nic_bytes[node]/node_time, + max_pkts, max_bytes/1000, + max_in_pkts, max_in_bytes*8/(4000*base_interval), + max_free_pkts, max_free_bytes*8/(4000*base_interval), + node_total_in[node]*8e-3/node_time, + node_total_freed[node]*8e-3/node_time)) + + # Output a table showing stats for the intervals with the highest + # and lowest free_bytes. + print() + print('Average interval statistics for each node, measured over %d ' + 'usec intervals.' 
% (base_interval))
+ print('For each node, intervals with at least 200 Kbytes of NIC data '
+ 'are selected;')
+ print('from this group the 10% slowest intervals (those with fewest '
+ 'bytes freed)')
+ print('and 10% fastest intervals (those with the most NIC bytes freed)'
+ ' are selected,')
+ print('and the following statistics are printed from each group:')
+ print('Node: Name of node')
+ print('NicPS: Average # packets owned by NIC at the end of '
+ 'slow intervals')
+ print('NicKbS: Average KB of data owned by NIC at the end of '
+ 'slow intervals')
+ print('FreePS: Average # packets freed during slow intervals')
+ print('FreeKbS: Average KB of data freed during slow intervals')
+ print('NicPF: Average # packets owned by NIC at the end of '
+ 'fast intervals')
+ print('NicKbF: Average KB of data owned by NIC at the end of '
+ 'fast intervals')
+ print('FreePF: Average # packets freed during fast intervals')
+ print('FreeKbF: Average KB of data freed during fast intervals')
+ print()
+ print('Node NicPS NicKbS FreePS FreeKbS NicPF NicKbF FreePF FreeKbF')
+ print('-----------------------------------------------------------------------')
+ for node in get_sorted_nodes():
+ intervals = sorted(node_intervals[node], key=lambda t: t[4])
+ slow = intervals[0:(len(intervals)//10)]
+ fast = intervals[9*len(intervals)//10:]
+ if len(intervals) < 10:
+ print('%s has only %d intervals' % (node, len(intervals)))
+ continue
+ print('%-10s %5.1f %6.1f %5.1f %6.1f %5.1f %6.1f %5.1f %6.1f' %
+ (node,
+ sum(t[1] for t in slow)/len(slow),
+ sum(t[2] for t in slow)/(1000*len(slow)),
+ sum(t[3] for t in slow)/len(slow),
+ sum(t[4] for t in slow)/(1000*len(slow)),
+ sum(t[1] for t in fast)/len(fast),
+ sum(t[2] for t in fast)/(1000*len(fast)),
+ sum(t[3] for t in fast)/len(fast),
+ sum(t[4] for t in fast)/(1000*len(fast)),))
+
+#------------------------------------------------
+# Analyzer: nicbacklog2
+#------------------------------------------------
+class AnalyzeNicbacklog2:
+ """
+ Prints a time-series analysis of backlog in the NIC (packets that
+ have been passed to the NIC but not yet returned after transmission).
+ This differs from the nicbacklog analyzer in that it analyzes
+ the distribution of traffic between device queues. Requires the
+ --data option.
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+ dispatcher.interest('AnalyzeTcppackets')
+ require_options('nicbacklog2', 'data')
+
+ def output(self):
+ global packets, tcp_packets, options, traces
+
+ # node -> list of packets transmitted by that node
+ node_pkts = defaultdict(list)
+
+ # Bucket all of the packets by transmitting node.
+ for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+ if not 'tso_length' in pkt or not pkt['tx_node']:
+ continue
+ if not 'tx_qid' in pkt:
+ continue
+ length = pkt['tso_length']
+ node = pkt['tx_node']
+ if 'nic' in pkt or 'free_tx_skb' in pkt:
+ node_pkts[node].append(pkt)
+
+ # Each iteration of this loop generates data for one node.
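+ # Minimal sketch of the heap discipline used below (assumed values,
+ # for illustration only):
+ # import heapq
+ # active = []
+ # heapq.heappush(active, [10.5, 0, {'tso_length': 1400}])
+ # heapq.heappush(active, [9.2, 1, {'tso_length': 700}])
+ # t, _, pkt = heapq.heappop(active) # t == 9.2: earliest free time
+ # Entries compare element by element, so the free time dominates and
+ # the index breaks ties without ever comparing the pkt dicts.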
+ for node in get_sorted_nodes():
+ f = open('%s/nicbacklog2_%s.dat' % (options.data, node), 'w')
+ f.write('# Node: %s\n' % (node))
+ f.write('# Generated at %s.\n\n' %
+ (time.strftime('%I:%M %p on %m/%d/%Y')))
+ f.write('# Statistics on NIC backlog (packets handed off to the '
+ 'NIC for transmission\n')
+ f.write('# but not yet returned to the kernel after xmit):\n')
+ f.write('# Time: End of interval (usecs)\n')
+ f.write('# Pkts: Total # packets owned by NIC\n')
+ f.write('# KB: Total kbytes of packet data owned by NIC\n')
+ f.write('# TcpKB: Total kbytes of TCP packet data in NIC\n')
+ f.write('# PktQs: Number of queues with packets\n')
+ f.write('# FreeQs: Number of queues for which a packet was freed\n')
+ f.write('# in the interval\n')
+ f.write('# Qid1: Id of queue with the most data\n')
+ f.write('# Pkts1: Number of packets in Qid1\n')
+ f.write('# KB1: Kbytes of packet data in Qid1 (Homa and TCP)\n')
+ f.write('# TcpKB1: Kbytes of TCP packet data in Qid1\n')
+ f.write('# Qid2, etc: Info for queue with second most data\n')
+ f.write('# Qid3, etc: Info for queue with third most data\n')
+ f.write('\n')
+ f.write('Time Pkts KB TcpKB PktQs FreeQs '
+ 'Qid1 Pkts1 KB1 TcpKB1 '
+ 'Qid2 Pkts2 KB2 TcpKB2 '
+ 'Qid3 Pkts3 KB3 TcpKB3\n')
+
+ # heapq of all active packets (those that are currently in
+ # the possession of the NIC) in increasing order of free time.
+ # Entries are <free, index, pkt> where free is the packet's
+ # free_tx_skb time (for sorting), index is the packet's index
+ # in the list of all packets (for resolving sorting ties), and
+ # pkt is the packet.
+ active = []
+
+ # queue -> 1 for all queues for which a packet was freed (i.e.
+ # transmitted) in the current interval.
+ queues_freed = {}
+
+ # queue -> count of bytes currently owned by this queue.
+ queue_bytes = defaultdict(lambda: 0)
+
+ # queue -> same as queue_bytes except count only TCP bytes.
+ queue_tcp_bytes = defaultdict(lambda: 0)
+
+ # queue -> count of packets currently owned by this queue.
+ queue_packets = defaultdict(lambda: 0)
+
+ pkts = sorted(node_pkts[node], key = lambda pkt :
+ pkt['nic'] if 'nic' in pkt else -1e20)
+ cur = 0
+ t = traces[node]['first_time']
+ interval_end = (math.ceil(traces[node]['first_time'] /
+ options.interval) * options.interval)
+
+ # Each iteration of this loop processes one event: either a
+ # packet handed off to the NIC or a packet freed.
+ while True:
+ # Decide on next event
+ if cur < len(pkts):
+ pkt = pkts[cur]
+ if 'nic' in pkt:
+ nic = pkt['nic']
+ else:
+ nic = traces[node]['first_time']
+ else:
+ nic = None
+ if nic != None and (not active or active[0][0] > nic):
+ free = False
+ t = nic
+ cur += 1
+ elif active:
+ t, _, pkt = heapq.heappop(active)
+ free = True
+ else:
+ break
+
+ # Handle end of interval(s)
+ while t >= interval_end:
+ # Format info for the queues with the most data.
+ details = ''
+ printed = 0
+ nonzero = 0
+ for qid, bytes in sorted(queue_bytes.items(),
+ key = lambda t: t[1], reverse = True):
+ if bytes == 0:
+ break
+ nonzero += 1
+ if printed < 3:
+ details += ' %4d %5d %5d %6d' % (qid,
+ queue_packets[qid], queue_bytes[qid]//1000,
+ queue_tcp_bytes[qid]//1000)
+ printed += 1
+
+ # Generate output line
+ f.write('%7.1f %5d %5d %5d %5d %6d%s\n' % (
+ interval_end, sum(queue_packets.values()),
+ sum(queue_bytes.values())//1000,
+ sum(queue_tcp_bytes.values())//1000,
+ nonzero, len(queues_freed), details))
+
+ interval_end += options.interval
+ queues_freed.clear()
+
+ # Update statistics with current event
+ qid = pkt['tx_qid']
+ if free:
+ queues_freed[qid] = 1
+ queue_packets[qid] -= 1
+ queue_bytes[qid] -= pkt['tso_length']
+ if pkt['type'] == 'tcp':
+ queue_tcp_bytes[qid] -= pkt['tso_length']
+ if queue_packets[qid] == 0:
+ if queue_bytes[qid] != 0:
+ raise Exception('%9.3f: queue_bytes[%d] is %d but '
+ 'queue_packets is 0' % (t, qid,
+ queue_bytes[qid]))
+ else:
+ queue_packets[qid] += 1
+ queue_bytes[qid] += pkt['tso_length']
+ if pkt['type'] == 'tcp':
+ queue_tcp_bytes[qid] += pkt['tso_length']
+ free_time = (pkt['free_tx_skb'] if 'free_tx_skb' in pkt else
+ traces[node]['last_time'])
+ heapq.heappush(active, [free_time, cur, pkt])
+
+ f.close()
+
+ print('\n---------------------')
+ print('Analyzer: nicbacklog2')
+ print('---------------------')
+ print('See data files %s/nicbacklog2_*.dat' % (options.data))
+
+#------------------------------------------------
+# Analyzer: nicpkts
+#------------------------------------------------
+class AnalyzeNicpkts:
+ """
+ Generate a history for each node of the packets owned by the NIC
+ (packets passed to the NIC but not yet returned after transmission),
+ showing the state of the NIC queues at each point in time and the
+ order in which packets are returned to Linux after transmission.
+ Requires the --data option.
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+ dispatcher.interest('AnalyzeTcppackets')
+ require_options('nicpkts', 'data')
+
+ def print_active(self, f, active, free_index):
+ """
+ Print out the list of active NIC packets for a node.
+ f: File in which to print information
+ active: List of packets currently owned by NIC
+ free_index: Index in active of the next packet to be freed;
+ this packet will be highlighted in the printout
+ """
+
+ num_this_line = 0
+ index = 0
+ for pkt in active:
+ if num_this_line == 4:
+ f.write('\n')
+ num_this_line = 0
+ elif num_this_line > 0:
+ f.write(' ')
+ if index == free_index:
+ f.write('%19s' % (''))
+ else:
+ f.write('%9.3f:%3d:%5d' % (pkt['nic'], pkt['tx_qid'],
+ pkt['tso_length']))
+ num_this_line += 1
+ index += 1
+ if num_this_line > 0:
+ f.write('\n')
+
+ def output(self):
+ global packets, tcp_packets, options, traces
+
+ # node -> list of packets transmitted by that node
+ node_pkts = defaultdict(list)
+
+ print('\n-----------------')
+ print('Analyzer: nicpkts')
+ print('-----------------')
+ print('See data files %s/nicpkts*.dat' % (options.data))
+
+ # Bucket all of the packets by transmitting node.
+ for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+ if (not 'nic' in pkt or not 'free_tx_skb' in pkt or
+ not 'tso_length' in pkt):
+ continue
+ node_pkts[pkt['tx_node']].append(pkt)
+
+ # Each iteration of this loop generates data for one node.
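+ # Unlike nicbacklog2, the active list below stays in NIC-handoff
+ # order and the next packet to free is found with a linear scan.
+ # A minimal equivalent of that scan (illustrative values only):
+ # active = [{'free_tx_skb': 12.0}, {'free_tx_skb': 9.5}]
+ # min_index = min(range(len(active)),
+ # key=lambda i: active[i]['free_tx_skb']) # -> 1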
+ for node in get_sorted_nodes():
+ f = open('%s/nicpkts_%s.dat' % (options.data, node), 'w')
+ f.write('# Node: %s\n' % (node))
+ f.write('# Generated at %s.\n\n' %
+ (time.strftime('%I:%M %p on %m/%d/%Y')))
+ f.write('# Each block of lines shows the packets owned by the '
+ 'NIC when the next\n')
+ f.write('# packet was returned to Linux after transmission. The '
+ 'first line contains\n')
+ f.write('# the time when the next packet was freed, plus '
+ 'information about the freed\n')
+ f.write('# packet.\n')
+ f.write('#\n')
+ f.write('# The following lines describe each of the packets '
+ 'owned by the NIC as\n')
+ f.write('# a tuple <t, qid, length>:\n')
+ f.write('# t: Time when the packet was passed to the NIC\n')
+ f.write('# qid: Id of the NIC queue containing the packet\n')
+ f.write('# length: Bytes in the packet\n')
+ f.write('# The freed packet is not displayed in the list; a blank '
+ 'field is displayed\n')
+ f.write('# in its place to highlight its position in the list.\n')
+
+ # Contains all packets currently owned by the NIC, in increasing
+ # order of nic time.
+ active = []
+
+ pkts = sorted(node_pkts[node], key = lambda pkt : pkt['nic'])
+ cur = 0
+
+ # Each iteration of this loop handles the next 'free_tx_skb'
+ # event, which includes adding packets to the active list,
+ # printing the list, and removing the next packet to be freed.
+ while cur < len(pkts):
+ # Find the first free time among active packets
+ min_free = 1e20
+ min_index = -1
+ for i in range(len(active)):
+ pkt = active[i]
+ if pkt['free_tx_skb'] < min_free:
+ min_index = i
+ min_free = pkt['free_tx_skb']
+
+ # Add more packets to the active list until the free time
+ # is reached; this could cause the first free time to change.
+ while cur < len(pkts):
+ pkt = pkts[cur]
+ if pkt['nic'] >= min_free:
+ break
+ cur += 1
+ active.append(pkt)
+ if pkt['free_tx_skb'] < min_free:
+ min_index = len(active) - 1
+ min_free = pkt['free_tx_skb']
+
+ # Print information about the active list and the packet
+ # that was just freed.
+ pkt = active[min_index]
+ f.write('\n%9.3f: qid %d, slot %d, %d bytes, queued at %.3f\n' %
+ (pkt['free_tx_skb'], pkt['tx_qid'], min_index,
+ pkt['tso_length'], pkt['nic']))
+ self.print_active(f, active, min_index)
+
+ del active[min_index]
+ f.close()
+
+#------------------------------------------------
+# Analyzer: nicqueues
+#------------------------------------------------
+class AnalyzeNicqueues:
+ """
+ Prints estimates of the amount of outbound packet data queued in the
+ NIC of each node, assuming that the NIC transmits at full link speed.
+ The --gbps option specifies the rate at which packets are transmitted.
+ With --data option, generates detailed timelines of NIC queue lengths.
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+
+ # Node name -> list of <time, length, queue_length, type> tuples for
+ # all transmitted packets. Length is the packet length including
+ # Homa/TCP header but not IP or Ethernet overheads. Queue_length is
+ # the # bytes in the NIC queue as of <time> (includes this packet).
+ # Queue_length starts off zero and is updated later.
Type indicates + # the kind of packet: "homa_data", "homa_grant", or "tcp" + self.nodes = defaultdict(list) + + def tt_send_grant(self, trace, t, core, id, offset, priority, increment): + self.nodes[trace['node']].append([t, 34, 0, "homa_grant"]) + + def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, seq_ack): + self.nodes[trace['node']].append([t, data_bytes, 0, "tcp"]) + + def output(self): + global options, traces, packets, dispatcher + + for pkt in packets.values(): + if 'tso_length' in pkt and 'nic' in pkt: + self.nodes[pkt['tx_node']].append([pkt['nic'], + pkt['tso_length'] + 60, 0, "homa_data"]) + for retrans in pkt['retransmits']: + if not 'nic' in retrans: + continue + if 'tso_length' in retrans: + length = retrans['tso_length'] + elif 'length' in pkt: + length = pkt['length'] + else: + continue + self.nodes[pkt['tx_node']].append([retrans['nic'], + length + 60, 0, "homa_data"]) + + print('\n-------------------') + print('Analyzer: nicqueues') + print('-------------------') + + # Compute queue lengths, find maximum for each node. + print('Worst-case length of NIC tx queue for each node, assuming a link') + print('speed of %.1f Gbps (change with --gbps):' % (options.gbps)) + print('Node: Name of node') + print('MaxLength: Highest estimated output queue length for NIC (bytes)') + print('Time: Time when worst-case queue length occurred') + print('Delay: Delay (usec until fully transmitted) experienced by packet') + print(' transmitted at Time') + print('P50: Median delay experienced by Homa data packets') + print('P90: 90th percentile delay experienced by Homa data packets') + print('P99: 99th percentile delay experienced by Homa data packets') + print('') + print('Node MaxLength Time Delay P50 P90 P99') + + for node in get_sorted_nodes(): + pkts = self.nodes[node] + if not pkts: + continue + pkts.sort() + max_queue = 0 + max_time = 0 + cur_queue = 0 + prev_time = traces[node]['first_time'] + for i in range(len(pkts)): + time, length, ignore, ignore2 = pkts[i] + + # 20 bytes for IPv4 header, 42 bytes for Ethernet overhead (CRC, + # preamble, interpacket gap) + total_length = length + 62 + + xmit_bytes = ((time - prev_time) * (1000.0*options.gbps/8)) + if xmit_bytes < cur_queue: + cur_queue -= xmit_bytes + else: + cur_queue = 0 + if 0 and (node == 'node6'): + if cur_queue == 0: + print('%9.3f (+%4.1f): length %6d, queue empty' % + (time, time - prev_time, total_length)) + else: + print('%9.3f (+%4.1f): length %6d, xmit %5d, queue %6d -> %6d' % + (time, time - prev_time, total_length, + xmit_bytes, cur_queue, cur_queue + total_length)) + cur_queue += total_length + if cur_queue > max_queue: + max_queue = cur_queue + max_time = time + prev_time = time + pkts[i][2] = cur_queue + data_pkts = sorted(filter(lambda t: t[3] == 'homa_data', pkts), + key=lambda t: t[2]) + print('%-10s %9d %9.3f %7.1f %7.1f %7.1f %7.1f' % ( + node, max_queue, max_time, + (max_queue*8)/(options.gbps*1000), + data_pkts[50*len(data_pkts)//100][2]*8/(options.gbps*1000), + data_pkts[90*len(data_pkts)//100][2]*8/(options.gbps*1000), + data_pkts[99*len(data_pkts)//100][2]*8/(options.gbps*1000))) + + if options.data: + # Print stats for each node at regular intervals + file = open('%s/nicqueues.dat' % (options.data), 'w') + line = 'Interval' + for node in get_sorted_nodes(): + line += ' %10s' % (node) + print(line, file=file) + + interval = options.interval + interval_end = get_first_interval_end() + end = get_last_time() + + # Node name -> current index in that node's packets + cur = {} + for node in 
get_sorted_nodes():
+ cur[node] = 0
+
+ while True:
+ line = '%8.1f' % (interval_end)
+ for node in get_sorted_nodes():
+ max = -1
+ i = cur[node]
+ xmits = self.nodes[node]
+ while i < len(xmits):
+ time, ignore, queue_length, type = xmits[i]
+ if time > interval_end:
+ break
+ if queue_length > max:
+ max = queue_length
+ i += 1
+ cur[node] = i
+ if max == -1:
+ line += ' ' * 11
+ else:
+ line += ' %10d' % (max)
+ print(line.rstrip(), file=file)
+ if interval_end > end:
+ break
+ interval_end += interval
+ file.close()
+
+#------------------------------------------------
+# Analyzer: nicsnapshot
+#------------------------------------------------
+class AnalyzeNicsnapshot:
+ """
+ Print information about the state of the NIC queues on a particular
+ node at a particular point in time. Requires the --time and --node
+ options. If --verbose is specified then all of the packets in the
+ possession of the NIC at the reference time are printed.
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+ dispatcher.interest('AnalyzeTcppackets')
+ require_options('nicsnapshot', 'time', 'node')
+
+ def output(self):
+ global options, packets, tcp_packets
+
+ # Queue id -> packets in queue at reference time.
+ id_pkts = defaultdict(list)
+
+ # Intervals to use for id_queued_interval and other variables.
+ intervals = [10, 20, 50, 100]
+
+ # Queue id -> dict of interval -> count, where interval is a time
+ # in usecs and count is the number of bytes that were handed
+ # off to the NIC for that queue in the interval preceding the
+ # reference time.
+ id_queued_interval = defaultdict(lambda: defaultdict(lambda: 0))
+
+ # Queue id -> dict of interval -> count, where interval is a time
+ # in usecs and count is the number of bytes in packets for that
+ # queue that were freed after transmission in the interval preceding
+ # the reference time.
+ id_freed_interval = defaultdict(lambda: defaultdict(lambda: 0))
+
+ # Queue id -> last packet freed for that queue before the reference
+ # time
+ id_last_freed = {}
+
+ # All packets active in the NIC at the reference time
+ all_active = []
+
+ # Scan all packets and fill in the variables above.
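+ # Worked example of the window tests below (assumed times): with
+ # --time 1000.0, a packet with nic == 975.0 satisfies
+ # nic >= options.time - i for i in (50, 100) but not (10, 20), so
+ # its bytes are counted in id_queued_interval[qid][50] and [100]
+ # only.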
+ for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+ if not pkt['tx_node'] or pkt['tx_node'] != options.node:
+ continue
+ if not 'tso_length' in pkt:
+ continue
+ length = pkt['tso_length'] + get_hdr_length(pkt)
+ qid = pkt['tx_qid'] if 'tx_qid' in pkt else 'unknown'
+ nic = pkt['nic'] if 'nic' in pkt else None
+ free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None
+ if (nic != None and free != None and nic < options.time
+ and free > options.time):
+ all_active.append(pkt)
+ id_pkts[qid].append(pkt)
+ for i in intervals:
+ if (nic != None and nic < options.time and
+ nic >= (options.time - i)):
+ id_queued_interval[qid][i] += length
+ if (free != None and free < options.time and
+ free >= (options.time - i)):
+ id_freed_interval[qid][i] += length
+ if free != None and free < options.time:
+ if (not qid in id_last_freed or
+ id_last_freed[qid]['free_tx_skb'] < free):
+ id_last_freed[qid] = pkt
+
+ print('\n---------------------')
+ print('Analyzer: nicsnapshot')
+ print('---------------------')
+ print('Information about the state of the NIC queues on %s at time %.3f:'
+ % (options.node, options.time))
+ print('Qid: Identifier of transmit queue')
+ print('Pkts: Packets in Qid at the reference time')
+ print('KB: Kbytes of packet data in Qid at the reference time')
+ print('Oldest: Time when the oldest packet in Qid was handed '
+ 'off to the NIC')
+ print('ODiff: Time difference between Oldest and reference time '
+ '(usecs)')
+ print('RecFree: Most recent time when a packet for Qid was returned '
+ 'to Linux and freed')
+ print('RFDiff: Time difference between RecFree and reference time '
+ '(usecs)')
+ print('QXX: Kbytes of packet data handed off to NIC for Qid '
+ 'in the XX usecs')
+ print(' preceding the reference time')
+ print('FXX: Kbytes of packet data freed after transmission '
+ 'in the XX usecs')
+ print(' preceding the reference time')
+ print()
+ print(' Qid Pkts KB Oldest ODiff RecFree RFDiff', end='')
+ for i in intervals:
+ print('%7s' % ('Q%d' % (i)), end='')
+ for i in intervals:
+ print('%7s' % ('F%d' % (i)), end='')
+ print()
+
+ # Interval (usecs) -> total bytes handed off to the NIC (for any
+ # queue) within that interval of the reference time.
+ total_interval_bytes = defaultdict(lambda: 0) + nic_oldest = {'nic': 1e20} + total_bytes = 0 + total_pkts = 0 + + qids = sorted(id_pkts.keys() | id_queued_interval.keys() | + id_freed_interval.keys()) + for qid in qids: + pkts = id_pkts[qid] + oldest_queued = None + q_bytes = 0 + total_pkts += len(pkts) + + for pkt in pkts: + nic = pkt['nic'] + if oldest_queued == None or nic < oldest_queued['nic']: + oldest_queued = pkt + if nic < nic_oldest['nic']: + nic_oldest = pkt + length = pkt['tso_length'] + get_hdr_length(pkt) + q_bytes += length + total_bytes += length + + if qid in id_last_freed: + t = id_last_freed[qid]['free_tx_skb'] + freed = '%.3f' % (t) + free_diff = '%.1f' % (options.time - t) + else: + freed = '' + free_diff = '' + if oldest_queued != None: + old_queue_time = '%.3f' % (oldest_queued['nic']) + odiff = '%.1f' % (options.time - oldest_queued['nic']) + else: + old_queue_time = '' + odiff = '' + line = '%5d %4d %6.1f %9s %7s %9s %7s' % (qid, len(pkts), + q_bytes*1e-3, old_queue_time, odiff, freed, free_diff) + for i in intervals: + bytes = id_queued_interval[qid][i] + if bytes != 0: + msg = '%.1f' % (bytes*1e-3) + else: + msg = '' + line += '%7s' % (msg) + for i in intervals: + bytes = id_freed_interval[qid][i] + if bytes != 0: + msg = '%.1f' % (bytes*1e-3) + else: + msg = '' + line += '%7s' % (msg) + print(line.rstrip()) + if id_last_freed: + rec_free = max(pkt['free_tx_skb'] for pkt in id_last_freed.values()) + freed = '%.3f' % (rec_free) + free_diff = '%.1f' % (options.time - rec_free) + else: + freed = '' + free_diff = '' + print('Total %4d %6.1f %9.3f %7.1f %9s %7s' % (total_pkts, total_bytes*1e-3, + nic_oldest['nic'], options.time - nic_oldest['nic'], + freed, free_diff), end='') + for i in intervals: + print('%7.1f' % ( + sum(q[i] for q in id_queued_interval.values())*1e-3), + end='') + for i in intervals: + print('%7.1f' % ( + sum(q[i] for q in id_freed_interval.values())*1e-3), + end='') + print() + + if options.verbose: + print('\nDetails for all of the packets owned by the NIC at time %.1f:' + % (options.time)) + all_active.sort(key=lambda pkt: [pkt['tx_qid'], pkt['nic']]) + print(print_pkts(all_active, header=True), end='') + +#------------------------------------------------ +# Analyzer: nictx +#------------------------------------------------ +class AnalyzeNictx: + """ + Analyze various factors related to NIC queueing and throughput for packet + transmission; generate graphs displaying these factors. Requires the + --plot option. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') + require_options('nictx', 'plot') + + def output(self): + global packets, grants, tcp_packets, options, traces + + # node -> list of packets transmitted by that node + node_pkts = defaultdict(list) + + # Bucket all of the packets by transmitting node. 
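+ # Unit conversion used throughout this analyzer (illustrative
+ # numbers, not from a trace): bytes * 8e-3 / usecs gives Gbps;
+ # e.g. 250000 bytes freed in a 20 usec interval is
+ # 250000 * 8e-3 / 20 = 100 Gbps.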
+ type_counts = defaultdict(lambda: 0)
+ for pkt in itertools.chain(packets.values(), tcp_packets.values(),
+ grants.values()):
+ if not pkt['tx_node'] or not 'tx_qid' in pkt:
+ continue
+ if pkt['type'] == 'grant':
+ length = 0
+ elif not 'tso_length' in pkt:
+ continue
+ node = pkt['tx_node']
+ if 'nic' in pkt or 'free_tx_skb' in pkt:
+ node_pkts[node].append(pkt)
+ type_counts[pkt['type']] += 1
+
+ # List of <queues, bytes, pkts, freed, queued, qdisc> tuples, one for
+ # each interval across all nodes:
+ # queues: The number of tx queues in the NIC with packets at the
+ # start of the interval
+ # bytes: The number of packet bytes queued in the NIC at the start
+ # of the interval
+ # pkts: The total number of queued packets at the start of the
+ # interval
+ # freed: The number of packet bytes freed after transmission during
+ # the interval
+ # queued: The number of new packet bytes queued during the interval
+ # qdisc: The number of bytes in packets that are queued in the
+ # qdisc system (they have been passed to ip*xmit but have
+ # not been handed off to the NIC)
+ intervals = []
+
+ # node -> dict containing data series for plotting:
+ # t: list of time values for the other data series
+ # qdisc: for each t, kbytes queued in qdiscs or NIC at t
+ # nic: for each t, kbytes queued in the NIC at t
+ # maxq: for each t, kbytes queued in the longest NIC queue at t
+ node_data = defaultdict(lambda: {'t': [], 'qdisc': [], 'nic': [],
+ 'maxq': []})
+
+ # Process the packets in each node separately in order to populate
+ # intervals and node_data.
+ for node in get_sorted_nodes():
+ # List of <time, type, pkt>, where time is the time when an
+ # event occurred, type indicates what happened ('xmit',
+ # 'nic', or 'freed'), and pkt is the packet for which the
+ # particular event occurred. The list is eventually sorted
+ # in time order.
+ events = []
+
+ # Generate list of interesting events.
+ first_time = traces[node]['first_time']
+ last_time = traces[node]['last_time']
+ for pkt in node_pkts[node]:
+ events.append([pkt['xmit'] if 'xmit' in pkt else first_time,
+ 'xmit', pkt])
+ events.append([pkt['nic'] if 'nic' in pkt else first_time,
+ 'nic', pkt])
+ events.append([pkt['free_tx_skb'] if 'free_tx_skb' in pkt
+ else last_time, 'freed', pkt])
+ events.sort(key=lambda t: t[0])
+
+ # Process the events in time order to generate statistics.
+
+ # qid -> count of packets currently owned by this queue.
+ qid_packets = defaultdict(lambda: 0)
+
+ # qid -> count of bytes currently owned by this queue.
+ qid_bytes = defaultdict(lambda: 0)
+
+ # Total bytes currently owned by NIC
+ nic_bytes = 0
+
+ # Total packets currently owned by NIC
+ nic_pkts = 0
+
+ # Total bytes that have been passed to ip*xmit but have not
+ # yet been queued in the NIC (they are queued in the qdisc system).
+ qdisc_bytes = 0
+
+ # The next <queues, bytes, pkts, freed, queued, qdisc> tuple that
+ # will be added to intervals.
+ next = [0, 0, 0, 0, 0, 0]
+
+ # True means there was at least one point in the current interval
+ # where the total # of bytes queued in the NIC dropped below
+ # a threshold value.
+ below_threshold = False + + t = traces[node]['first_time'] + interval_end = (math.ceil(first_time / options.interval) * + options.interval) + + data = node_data[node] + + for t, event, pkt in events: + # Handle end of interval(s) + while t >= interval_end: + if first_time <= (interval_end - options.interval): + gbps_in = next[4] * 8e-3 / options.interval + gbps_out = next[3] * 8e-3 / options.interval + if 0 and gbps_in > 100: + print('%9.1f %s has %.1f KB queued data, tput %.1f ' + 'Gbps, input %.1f Gbps' + % (interval_end, node, nic_bytes * 1e-3, + gbps_out, gbps_in)) + if not below_threshold: + intervals.append(next) + data['t'].append(interval_end) + data['qdisc'].append((qdisc_bytes + nic_bytes) * 1e-3) + data['nic'].append(nic_bytes * 1e-3) + data['maxq'].append(max(qid_bytes.values()) * 1e-3) + active_queues = sum(n > 0 for n in qid_packets.values()) + next = [active_queues, nic_bytes, nic_pkts, 0, 0, + qdisc_bytes] + below_threshold = False + interval_end += options.interval + + # Process event + qid = pkt['tx_qid'] + if pkt['type'] == 'grant': + length = get_hdr_length(pkt) + else: + length = pkt['tso_length'] + get_hdr_length(pkt) + if event == 'xmit': + qdisc_bytes += length + elif event == 'nic': + qid_packets[qid] += 1 + qid_bytes[qid] += length + nic_pkts += 1 + nic_bytes += length + qdisc_bytes -= length + if 'nic' in pkt: + next[4] += length + elif event == 'freed': + qid_packets[qid] -= 1 + qid_bytes[qid] -= length + nic_pkts -= 1 + nic_bytes -= length + # if nic_bytes < 1000000: + # below_threshold = True + next[3] += length + else: + raise Exception('unknown event type %s' % event) + + # Plot throughput vs. number of active queues + xmax = 30 + x = range(xmax) + buckets = [[] for _ in x] + for queues, bytes, pkts, freed, queued, qdisc in intervals: + if queues < xmax: + buckets[queues].append(freed * 8e-3 / options.interval) + y = [] + yerr = [] + for bucket in buckets: + if not bucket: + y.append(0) + yerr.append(0) + continue + y.append(sum(bucket) / len(bucket)) + b = sorted(bucket) + if len(bucket) >= 2: + yerr.append(stdev(bucket)) + else: + yerr.append(0) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, xmax) + ax.set_xlabel('# Nic Queues Occupied') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate(Gbps)') + ax.errorbar(x, y, yerr=yerr, fmt='o', capsize=4) + plt.tight_layout() + plt.savefig('%s/nictx_vs_queues.pdf' % (options.plot)) + + # Plot throughput vs. Kbytes owned by NIC + xmax = 2500 + bucket_size = 100 + x = range(0, xmax, bucket_size) + buckets = [[] for _ in x] + for queues, bytes, pkts, freed, queued, qdisc in intervals: + kb = bytes//1000 + if kb < xmax: + buckets[kb//bucket_size].append(freed * 8e-3 / options.interval) + y = [] + yerr = [] + for bucket in buckets: + if not bucket: + y.append(0) + yerr.append(0) + continue + y.append(sum(bucket) / len(bucket)) + if len(bucket) >= 2: + yerr.append(stdev(bucket)) + else: + yerr.append(0) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, xmax) + ax.set_xlabel('KBytes in NIC Queues') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate(Gbps)') + ax.errorbar(x, y, yerr=yerr, fmt='o', capsize=4) + plt.tight_layout() + plt.savefig('%s/nictx_vs_kb.pdf' % (options.plot)) + + # Plot throughput vs. 
packets owned by NIC + xmax = 30 + x = range(xmax) + buckets = [[] for _ in x] + for queues, bytes, pkts, freed, queued, qdisc in intervals: + if pkts < xmax: + buckets[pkts].append(freed * 8e-3 / options.interval) + y = [] + yerr = [] + for bucket in buckets: + if not bucket: + y.append(0) + yerr.append(0) + continue + y.append(sum(bucket) / len(bucket)) + if len(bucket) >= 2: + yerr.append(stdev(bucket)) + else: + yerr.append(0) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, xmax) + ax.set_xlabel('# Packets Owned by NIC') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate(Gbps)') + ax.errorbar(x, y, yerr=yerr, fmt='o', capsize=4) + plt.tight_layout() + plt.savefig('%s/nictx_vs_pkts.pdf' % (options.plot)) + + # Generate CDF of throughput for intervals. + tput = [(i[3] * 8e-3 / options.interval) for i in intervals] + tput.sort() + y = [i / len(tput) for i in range(len(tput))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 120) + ax.set_xlabel('Tx Completion Rate (Gbps)') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) + plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") + plt.plot(tput, y) + plt.tight_layout() + plt.savefig('%s/nictx_tput_cdf.pdf' % (options.plot)) + + # Generate CDF of active queues for intervals. + active = [i[0] for i in intervals] + active.sort() + y = [i / len(active) for i in range(len(active))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 20) + ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(2)) + ax.set_xlabel('Active NIC queues') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) + plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") + plt.plot(active, y) + plt.tight_layout() + plt.savefig('%s/nictx_queues_cdf.pdf' % (options.plot)) + + # Generate CDF of KBytes in queued packets. + kb = [i[1] * 1e-3 for i in intervals] + kb.sort() + y = [i / len(kb) for i in range(len(kb))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, kb[95*len(kb)//100]) + ax.set_xlabel('KBytes in Queued Packets') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) + plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") + plt.plot(kb, y) + plt.tight_layout() + plt.savefig('%s/nictx_kb_cdf.pdf' % (options.plot)) + + # Generate CDF of KBytes in packets queued in a qdisc. + kb = [i[5] * 1e-3 for i in intervals] + kb.sort() + y = [i / len(kb) for i in range(len(kb))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 5000) + ax.set_xlabel('Kbytes in Packets Queued in a Qdisc') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) + plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") + plt.plot(kb, y) + plt.tight_layout() + plt.savefig('%s/nictx_qdisc_cdf.pdf' % (options.plot)) + + # Generate CDF of queued packets for intervals. 
+ pkts = [i[2] for i in intervals] + pkts.sort() + y = [i / len(pkts) for i in range(len(pkts))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 50) + ax.set_xlabel('Packets Queued in NIC') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) + plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") + plt.plot(pkts, y) + plt.tight_layout() + plt.savefig('%s/nictx_pkts_cdf.pdf' % (options.plot)) + + # Generate CDF of input to the NIC for intervals. + input = [(i[4] * 8e-3 / options.interval) for i in intervals] + input.sort() + y = [i / len(input) for i in range(len(input))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 120) + ax.set_xlabel('Rate of New Bytes Queued in NIC (Gbps)') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) + plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") + plt.plot(input, y) + plt.tight_layout() + plt.savefig('%s/nictx_input_cdf.pdf' % (options.plot)) + + # Generate time-series plot showing queuing in the qdisc and NIC + # for each node. + x_min = get_first_time() + x_max = get_last_time() + nodes = get_sorted_nodes() + maxy = max(max(node_data[node]['qdisc']) for node in nodes) + fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False, + figsize=[8, len(nodes)*2]) + for i in range(len(nodes)): + node = nodes[i] + ax = axes[i] + ax.set_xlim(x_min, x_max) + ax.set_xlabel('Time (%s)' % (node)) + ax.set_ylim(0, maxy) + ax.set_ylabel('Kbytes Queued') + ax.grid(which="major", axis="y") + ax.plot(node_data[node]['t'], node_data[node]['qdisc'], + color=color_blue, label='Nic + Qdisc') + ax.plot(node_data[node]['t'], node_data[node]['nic'], + color=color_red, label='Nic') + legend_handles = [ + matplotlib.lines.Line2D([], [], color=c, marker='o', + linestyle='None', markersize=8, label=label) + for c, label in [[color_blue, 'Nic + Qdisc'], + [color_red, 'Nic']] + ] + fig.legend(handles=legend_handles) + plt.tight_layout() + plt.savefig("%s/nictx_qtrend.pdf" % (options.plot), bbox_inches='tight') + + # Generate time-series plot showing length of the longest NIC queue + # for each node + x_min = get_first_time() + x_max = get_last_time() + nodes = get_sorted_nodes() + maxy = max(max(node_data[node]['maxq']) for node in nodes) + fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False, + figsize=[8, len(nodes)*2]) + for i in range(len(nodes)): + node = nodes[i] + ax = axes[i] + ax.set_xlim(x_min, x_max) + ax.set_xlabel('Time (%s)' % (node)) + ax.set_ylim(0, maxy) + ax.set_ylabel('Longest NIC Queue (KB)') + ax.grid(which="major", axis="y") + ax.plot(node_data[node]['t'], node_data[node]['maxq'], + color=color_blue) + plt.tight_layout() + plt.savefig("%s/nictx_maxqtrend.pdf" % (options.plot), bbox_inches='tight') + + print('\n---------------') + print('Analyzer: nictx') + print('---------------') + print('Analyzed %d Homa data packets, %d Homa grants, %d ' + 'TCP packets.' % (type_counts['data'], type_counts['grant'], + type_counts['tcp'])) + print('Analyzed %d intervals of length %d usecs (use --interval ' + 'option to' % (len(intervals), options.interval)) + print('change interval length).') + print('See files %s/nictx_*.pdf for plots.' % (options.plot)) + +#------------------------------------------------ +# Analyzer: ooo +#------------------------------------------------ +class AnalyzeOoo: + """ + Prints statistics about out-of-order packet arrivals within a message. 
+ Also prints details about out-of-order packets in the RPCs that
+ experienced the highest out-of-order delays (--verbose will print info
+ for all OOO RPCs).
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzeRpcs')
+
+ def output(self):
+ global rpcs, options
+
+ total_rpcs = 0
+ total_packets = 0
+ ooo_packets = 0
+
+ # Each element of this list contains a <delay, info> tuple describing
+ # all of the out-of-order packets in a single RPC: delay is the
+ # maximum delay experienced by any of the out-of-order packets, and
+ # info contains one or more lines of text, each line describing one
+ # ooo packet.
+ ooo_rpcs = []
+
+ # Each element of this list represents one RPC whose completion
+ # was delayed by ooo packets (i.e. the last packet received didn't
+ # contain the last bytes of the message). Each element is a tuple
+ # <delay, id, count>:
+ # delay: time between the arrival of the packet containing the
+ # last bytes of the message and the arrival of the last
+ # packet
+ # id: RPC identifier
+ # count: the number of packets that arrived after the one containing
+ # the last bytes of the message
+ delayed_msgs = []
+
+ # Scan the incoming packets in each RPC.
+ for id, rpc in rpcs.items():
+ if not 'gro_data' in rpc:
+ continue
+ total_rpcs += 1
+ pkts = rpc['gro_data']
+ total_packets += len(pkts)
+ highest_index = -1
+ highest_offset = -1
+ highest_offset_time = 0
+ last_time = 0
+ packets_after_highest = 0
+ highest_prio = 0
+ max_delay = -1
+ info = ''
+ for i in range(len(pkts)):
+ time, offset, prio = pkts[i]
+ last_time = time
+ if offset > highest_offset:
+ highest_index = i
+ highest_offset = offset
+ highest_offset_time = time
+ highest_prio = prio
+ packets_after_highest = 0
+ continue
+ else:
+ packets_after_highest += 1
+
+ # This packet is out of order. Find the first packet received
+ # with higher offset than this one so we can compute how long
+ # this packet was delayed.
+ ooo_packets += 1
+ gap = highest_index
+ while gap > 0:
+ if pkts[gap-1][1] < offset:
+ break
+ gap -= 1
+ gap_time, gap_offset, gap_prio = pkts[gap]
+ delay = time - gap_time
+ if max_delay == -1:
+ rpc_id = '%12d' % (id)
+ else:
+ rpc_id = ' ' * 12
+ info += '%s %7d %10s %9.3f %7.1f %8d %3d %3d\n' % (rpc_id, offset,
+ rpc['node'], time, delay, highest_offset - offset,
+ prio, highest_prio)
+ if delay > max_delay:
+ max_delay = delay
+ if info:
+ ooo_rpcs.append([max_delay, info])
+ if packets_after_highest > 0:
+ delayed_msgs.append([last_time - highest_offset_time, id,
+ packets_after_highest])
+
+ print('\n-------------')
+ print('Analyzer: ooo')
+ print('-------------')
+ print('Messages with out-of-order packets: %d/%d (%.1f%%)' %
+ (len(ooo_rpcs), total_rpcs, 100.0*len(ooo_rpcs)/total_rpcs))
+ print('Out-of-order packets: %d/%d (%.1f%%)' %
+ (ooo_packets, total_packets, 100.0*ooo_packets/total_packets))
+ if delayed_msgs:
+ delayed_msgs.sort()
+ print('')
+ print('Messages whose completion was delayed by out-of-order packets: '
+ '%d (%.1f%%)' % (len(delayed_msgs),
+ 100.0*len(delayed_msgs)/len(rpcs)))
+ print('P50 completion delay: %.1f us' % (
+ delayed_msgs[len(delayed_msgs)//2][0]))
+ print('P90 completion delay: %.1f us' % (
+ delayed_msgs[(9*len(delayed_msgs))//10][0]))
+ print('Worst delays:')
+ print('Delay (us) RPC Receiver Late Pkts')
+ for i in range(len(delayed_msgs)-1, len(delayed_msgs)-6, -1):
+ if i < 0:
+ break
+ delay, id, pkts = delayed_msgs[i]
+ print(' %8.1f %10d %10s %5d' %
+ (delay, id, rpcs[id]['node'], pkts))
+
+ delayed_msgs.sort(key=lambda t : t[2])
+ packets_sum = sum(i[2] for i in delayed_msgs)
+ print('Late packets per delayed message: P50 %.1f, P90 %.1f, Avg %.1f' %
+ (delayed_msgs[len(delayed_msgs)//2][2],
+ delayed_msgs[(9*len(delayed_msgs))//10][2],
+ packets_sum / len(delayed_msgs)))
+ else:
+ print('No RPCs had their completion delayed by out-of-order packets')
+
+ if not ooo_rpcs:
+ return
+ print('')
+ print('Information about out-of-order packets, grouped by RPC and sorted')
+ print('so that RPCs with largest OOO delays appear first (use --verbose')
+ print('to display all RPCs with OOO packets):')
+ print('RPC: Identifier for the RPC')
+ print('Offset: Offset of the out-of-order packet within the RPC')
+ print('Node: Node on which the packet was received')
+ print('Time: Time when the packet was received by homa_gro_receive')
+ print('Delay: Time - receive time for earliest packet with higher offset')
+ print('Gap: Offset of highest packet received before this one, minus')
+ print(' offset of this packet')
+ print('Prio: Priority of this packet')
+ print('Prev: Priority of the highest-offset packet received before ')
+ print(' this one')
+ print('')
+ print(' RPC Offset Node Time Delay Gap Prio Prev')
+ print('--------------------------------------------------------------------')
+ ooo_rpcs.sort(key=lambda t : t[0], reverse=True)
+ count = 0
+ for delay, info in ooo_rpcs:
+ if (count >= 20) and not options.verbose:
+ break
+ print(info, end='')
+ count += 1
+
+#------------------------------------------------
+# Analyzer: p99short
+#------------------------------------------------
+class AnalyzeP99short:
+ """
+ Selects the 1% of short RPCs (those with single-packet request and
+ response messages) with highest RTT and breaks down the delay both
+ for the overall RPCs and for their constituent packets.
+ """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + + def output(self): + global rpcs + + # tuples for all short rpcs. + short_rpcs = [] + + for rpc in rpcs.values(): + # Select only client RPCs, and make sure there is complete + # information for each RPC. + if rpc['id'] & 0x1: + continue + if not 'sendmsg' in rpc or not 'recvmsg_done' in rpc: + continue + peer = get_rpc_node(rpc['id'] ^ 1) + if not peer: + continue + if rpc['sendmsg'] < traces[peer]['first_time']: + continue + if rpc['recvmsg_done'] > traces[peer]['last_time']: + continue + if len(rpc['send_data_pkts']) != 1: + continue + if len(rpc['softirq_data_pkts']) != 1: + continue + short_rpcs.append([rpc['recvmsg_done'] - rpc['sendmsg'], rpc]) + + print('\n------------------') + print('Analyzer: p99short') + print('------------------') + + if not short_rpcs: + print('Couldn\'t find any single-packet RPCs') + return + short_rpcs.sort(key=lambda t: t[0]) + packets = [] + slow_rpcs = [] + for rtt, rpc in reversed(short_rpcs[99*len(short_rpcs)//100:]): + slow_rpcs.append(rpc) + packets.append(rpc['send_data_pkts'][0]) + packets.append(rpc['softirq_data_pkts'][0]) + print('The slowest 1% of short RPCs (those with a single packet for ' + 'request and') + print('response):') + print(print_rpcs(slow_rpcs, header=True), end='') + print('\nPackets from the slow RPCs:') + print(print_pkts(packets, header=True), end='') + +#------------------------------------------------ +# Analyzer: packet +#------------------------------------------------ +class AnalyzePacket: + """ + Analyzes the delay between when a particular packet was sent and when + it was received by GRO: prints information about other packets competing + for the same GRO core. Must specify the packet of interest with the + '--pkt ID:offset' option: this is the packet id on the sender. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + return + + def output(self): + global rpcs, traces, options, ip_to_node, packets + + print('\n-----------------') + print('Analyzer: packet') + print('-----------------') + if not options.pkt: + print('Skipping packet analyzer: --pkt not specified', + file=sys.stderr) + return + + if not options.pkt in packets: + print('Can\'t find packet %s' % (options.pkt), + file=sys.stderr) + return + pkt = packets[options.pkt] + for field in ['gro', 'priority', 'xmit']: + if not field in pkt: + print('Packet %s doesn\'t have a "%s" field' % (options.pkt, + field), file=sys.stderr) + return + xmit_time = pkt['xmit'] + xmit_id = pkt['id'] + recv_time = pkt['gro'] + rx_node = pkt['rx_node'] + gro_core = pkt['gro_core'] + tx_node = pkt['tx_node'] + + print('Packet: RPC id %d, offset %d, delay %6.1f us' % (xmit_id, + options.pkt_offset, recv_time - xmit_time)) + print('%.3f: Packet passed to ip*xmit on %s, core %d' % (xmit_time, + tx_node, pkt['tx_core'])) + if 'nic' in pkt: + print('%.3f: Packet handed off to NIC on %s' % (pkt['nic'], + tx_node)) + if 'free_tx_skb' in pkt: + print('%.3f: sk_buff returned by NIC and freed' % ( + pkt['free_tx_skb'])) + print('%.3f: Packet received by %s on core %d with priority %d' + % (recv_time, pkt['rx_node'], pkt['gro_core'], pkt['priority'])) + + # List of packets received by rx_node whose lifetimes overlap + # the reference packet. + pkts = [] + + # Amount of data already in transit to target at the time reference + # packet was transmitted. 
+ # packet was transmitted.
+ prior_bytes = 0
+ prior_pkts = 0
+
+ for p in packets.values():
+ if not 'gro' in p:
+ continue
+ if p['gro'] < xmit_time:
+ continue
+ if p['rx_node'] != rx_node:
+ continue
+ if p is pkt:
+ continue
+ if 'xmit' in p:
+ if p['xmit'] >= recv_time:
+ continue
+ if p['xmit'] <= xmit_time:
+ prior_bytes += p['length']
+ prior_pkts += 1
+ elif p['gro'] >= recv_time:
+ continue
+ pkts.append(p)
+
+ # Amount of data transmitted after the reference packet but received
+ # on the reference packet's core before the reference packet.
+ after_core_bytes = 0
+ after_core_pkts = 0
+
+ # Amount of data transmitted after the reference packet but received
+ # on other cores before the reference packet.
+ after_other_bytes = 0
+ after_other_pkts = 0
+
+ # Create output messages grouped into categories.
+ pkts.sort(key=lambda p : p['gro'])
+ before_before = ''
+ before_after = ''
+ after_before_core = ''
+ after_before_other = ''
+ after_after = ''
+ unknown_before = ''
+ for p in pkts:
+ if not 'xmit' in p:
+ sender = ""
+ if p['id'] in rpcs:
+ sender = rpcs[p['id']]['node']
+ unknown_before += ('\n ??? %9.3f ??? %11d %7d '
+ '%-10s %4s %6d %2d %4d' % (p['gro'], p['id'],
+ p['offset'], sender, "", p['length'],
+ p['priority'], p['gro_core']))
+ continue
+ msg = '\n%9.3f %9.3f %8.1f %11d %7d %-10s %4s %6d %2d %4d' % (
+ p['xmit'], p['gro'], p['gro'] - p['xmit'], p['id'],
+ p['offset'], p['tx_node'], p['tx_core'], p['length'],
+ p['priority'], p['gro_core'])
+ if p['xmit'] < xmit_time:
+ if p['gro'] < recv_time:
+ before_before += msg
+ else:
+ before_after += msg
+ else:
+ if p['gro'] < recv_time:
+ if p['gro_core'] == gro_core:
+ after_before_core += msg
+ after_core_bytes += p['length']
+ after_core_pkts += 1
+ else:
+ after_before_other += msg
+ after_other_bytes += p['length']
+ after_other_pkts += 1
+ else:
+ after_after += msg
+
+ print('%.1f KB (%d packets) already in transit to %s when packet '
+ 'transmitted' % (prior_bytes * 1e-3, prior_pkts, rx_node))
+ print(' (%.1f us at %.0f Gbps)' % (
+ bytes_to_usec(prior_bytes), options.gbps))
+ print('%.1f KB (%d packets) transmitted to core %d after packet was '
+ 'transmitted but' % (after_core_bytes * 1e-3, after_core_pkts,
+ gro_core))
+ print(' received before packet (%.1f us at %.0f Gbps)'
+ % (bytes_to_usec(after_core_bytes), options.gbps))
+ print('%.1f KB (%d packets) transmitted to other cores after packet '
+ 'was' % (after_other_bytes * 1e-3,
+ after_other_pkts))
+ print(' transmitted but received before packet (%.1f us '
+ 'at %.0f Gbps)' % (bytes_to_usec(after_other_bytes),
+ options.gbps))
+ print('\nOther packets whose transmission to %s overlapped this '
+ 'packet:' % (rx_node))
+ print('Xmit: Time packet was transmitted')
+ print('Recv: Time packet was received on core %d' % (gro_core))
+ print('Delay: End-to-end latency for packet')
+ print('Rpc: Id of packet\'s RPC (on sender)')
+ print('Offset: Offset of packet within message')
+ print('Sender: Node that sent packet')
+ print('TxCore: Core on which packet was passed to ip*xmit')
+ print('Length: Number of message bytes in packet')
+ print('Prio: Priority at which packet was transmitted')
+ print('Core: Core on which homa_gro_receive handled packet')
+ hdr = ' Xmit Recv Delay Rpc Offset Sender ' \
+ 'TxCore Length Prio Core\n' \
+ '------------------------------------------------------------' \
+ '-------------------------'
+ if before_before:
+ print('\nSent before %s, received before:\n%s\n%s' %
+ (options.pkt, hdr, before_before))
+ if before_after:
+ print('\nSent before %s,
received after:\n%s\n%s' % + (options.pkt, hdr, before_after)) + if after_before_core: + print('\nSent after %s, received on core %d before:\n%s\n%s' % + (options.pkt, gro_core, hdr, after_before_core)) + if after_before_other: + print('\nSent after %s, received on other cores before:\n%s\n%s' % + (options.pkt, hdr, after_before_other)) + if after_after: + print('\nSent after %s, received after:\n%s\n%s' % + (options.pkt, hdr, after_after)) + if unknown_before: + print('\nSend time unknown, received before:\n%s\n%s' % (hdr, + unknown_before)) + +#------------------------------------------------ +# Analyzer: packets +#------------------------------------------------ +class AnalyzePackets: + """ + Collects information about each data packet and grant but doesn't + generate any output. The data it collects is used by other analyzers. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + + # offset -> Largest length that has occurred for that offset in a + # TSO packet. Used to compute tso_offset field if it is + # missing + self.tso_lengths = defaultdict(lambda : -1) + return + + def init_trace(self, trace): + # RPC id -> list of live data packets for that RPC (packets that + # have been received by homa_gro_receive but not yet copied to user + # space). + self.live = defaultdict(list) + + # Core -> list of packets that have been copied out to user space by + # that core (but not yet freed). + self.copied = defaultdict(list) + + def tt_ip_xmit(self, trace, t, core, id, offset): + global packets, rpcs + p = packets[pkt_id(id, offset)] + p['tx_node'] = trace['node'] + if not p['retransmits']: + p['xmit'] = t + p['tx_core'] = core + rpcs[id]['send_data_pkts'].append(p) + else: + p['retransmits'][-1]['xmit'] = t + + def tt_nic_data(self, trace, t, core, peer, id, offset, tx_queue): + global packets + p = packets[pkt_id(id, offset)] + p['tx_node'] = trace['node'] + if not p['retransmits']: + p['nic'] = t + p['tx_queue'] = tx_queue + else: + p['retransmits'][-1]['nic'] = t + + def tt_free_tx_skb(self, trace, t, core, id, offset, qid, msg_length): + global packets + p = packets[pkt_id(id, offset)] + p['tx_node'] = trace['node'] + if not p['retransmits']: + p['free_tx_skb'] = t + p['tx_qid'] = qid + p['msg_length'] = msg_length + else: + p = p['retransmits'][-1] + p['free_tx_skb'] = t + + def tt_gro_data(self, trace, t, core, peer, id, offset, prio): + global packets, recv_offsets, rpcs + p = packets[pkt_id(id^1, offset)] + if not 'gro' in p: + rpcs[id]['gro_data_pkts'].append(p) + p['gro'] = t + p['priority'] = prio + p['gro_core'] = core + p['rx_node'] = trace['node'] + recv_offsets[offset] = True + self.live[id^1].append(p) + + def tt_softirq_data(self, trace, t, core, id, offset, msg_length): + global packets, rpcs + p = packets[pkt_id(id^1, offset)] + if not 'softirq' in p: + rpcs[id]['softirq_data_pkts'].append(p) + p['softirq'] = t + p['softirq_core'] = core + p['msg_length'] = msg_length + p['rx_node'] = trace['node'] + + def tt_copy_out_done(self, trace, t, core, id, start, end): + pkts = self.live[id^1] + for i in range(len(pkts) -1, -1, -1): + p = pkts[i] + if (p['offset'] >= start) and (p['offset'] < end): + p['copied'] = t + self.copied[core].append(p) + pkts.pop(i) + + def tt_free_skbs(self, trace, t, core, num_skbs): + for p in self.copied[core]: + p['free'] = t + self.copied[core] = [] + + def tt_send_data(self, trace, t, core, id, offset, length): + global packets + p = packets[pkt_id(id, offset)] + if not p['retransmits']: + if length > 
self.tso_lengths[offset]: + self.tso_lengths[offset] = length + p['tso_length'] = length + else: + p['retransmits'][-1]['tso_length'] = length + if id == 202753545: + print("tt_send_data got packet: %s" % (p)) + + def tt_pacer_xmit(self, trace, t, core, id, offset, port, bytes_left): + global packets + p = packets[pkt_id(id, offset)] + if p['retransmits']: + p = p['retransmits'][-1] + p['pacer'] = True + + def tt_qdisc_defer(self, trace, t, core, id, offset): + global packets + p = packets[pkt_id(id, offset)] + p['tx_node'] = trace['node'] + if p['retransmits']: + p = p['retransmits'][-1] + p['qdisc_defer'] = t + + def tt_qdisc_xmit(self, trace, t, core, id, offset): + global packets + p = packets[pkt_id(id, offset)] + p['tx_node'] = trace['node'] + if p['retransmits']: + p = p['retransmits'][-1] + p['qdisc_xmit'] = t + + def tt_retransmit(self, trace, t, core, id, offset, length): + global packets + p = packets[pkt_id(id, offset)] + p['retransmits'].append({'retrans': t}) + + def tt_send_grant(self, trace, t, core, id, offset, priority, increment): + global grants, rpcs + g = grants[pkt_id(id, offset)] + if not 'xmit' in g: + rpcs[id]['send_grant_pkts'].append(g) + g['xmit'] = t + g['tx_node'] = trace['node'] + g['increment'] = increment + + def tt_nic_grant(self, trace, t, core, peer, id, offset, tx_queue): + global grants + g = grants[pkt_id(id, offset)] + g['nic'] = t + g['tx_node'] = trace['node'] + g['tx_queue'] = tx_queue + + def tt_free_grant(self, trace, t, core, id, offset, qid): + global grants + g = grants[pkt_id(id, offset)] + g['free_tx_skb'] = t + g['tx_qid'] = qid + g['tx_node'] = trace['node'] + + def tt_gro_grant(self, trace, t, core, peer, id, offset, priority): + global grants + g = grants[pkt_id(id^1, offset)] + if not 'gro' in g: + rpcs[id]['gro_grant_pkts'].append(g) + g['gro'] = t + g['gro_core'] = core + g['rx_node'] = trace['node'] + + def tt_softirq_grant(self, trace, t, core, id, offset, priority, increment): + global grants + g = grants[pkt_id(id^1, offset)] + if not 'softirq' in g: + rpcs[id]['softirq_grant_pkts'].append(g) + g['softirq'] = t + g['softirq_core'] = core + g['increment'] = increment + g['rx_node'] = trace['node'] + + def analyze(self): + """ + Try to deduce missing packet fields, such as message length. 
+ """ + global packets, rpcs, grants + sync_error_printed = False + + missing_rpc = {'send_data': []} + new_pkts = [] + for pkt in packets.values(): + id = pkt['id'] + if id in rpcs: + tx_rpc = rpcs[id] + else: + tx_rpc = missing_rpc + if not 'msg_length' in pkt: + pkt['msg_length'] = None + if 'out_length' in tx_rpc: + pkt['msg_length'] = tx_rpc['out_length'] + elif id^1 in rpcs: + rx_rpc = rpcs[id^1] + if rx_rpc['in_length'] != None: + pkt['msg_length'] = rx_rpc['in_length'] + pkt['length'] = get_recv_length(pkt['offset'], pkt['msg_length']) + if ('xmit' in pkt) and ('gro' in pkt) and ( + (pkt['gro'] - pkt['xmit']) < -5.0): + if not sync_error_printed: + print('Timetraces don\'t appear to be synchronized ' + '(did you run ttsync.py?); packet arrived before ' + 'transmitted: %s' % (pkt), + file=sys.stderr) + sync_error_printed = True + + if not 'tso_length' in pkt: + offset = pkt['offset'] + if offset in self.tso_lengths: + tso_length = self.tso_lengths[offset] + msg_length = pkt['msg_length'] + if (msg_length != None) and ((offset + tso_length) > msg_length): + pkt['tso_length'] = msg_length - offset + else: + pkt['tso_length'] = tso_length + + if not 'tx_node' in pkt: + pkt['tx_node'] = get_rpc_node(id) + + if not 'rx_node' in pkt: + pkt['rx_node'] = get_rpc_node(id^1) + + if 'qdisc_xmit' in pkt: + pkt['xmit2'] = pkt['qdisc_xmit'] + elif ('xmit' in pkt) and (not 'qdisc_defer' in pkt): + pkt['xmit2'] = pkt['xmit'] + + # Make sure that all of the smaller packets deriving from each + # TSO packet are represented and properly populated (if one of + # these packets is lost it won't be represented yet). + if 'tso_length' in pkt: + tso_length = pkt['tso_length'] + if tso_length < pkt['length']: + pkt['length'] = tso_length + offset = pkt['offset'] + id = pkt['id'] + end = tso_length + offset + offset += get_recv_length(offset, end) + while offset < end: + pid = pkt_id(id, offset) + length = get_recv_length(offset, end) + if length == 0: + print('get_recv_length returned 0 length for offset ' + '%d for pkt %s' % (offset, pkt), file=sys.stderr) + break + if pid in packets: + pkt2 = packets[pid] + else: + pkt2 = {'offset': offset, 'length': length, + 'retransmits': []} + new_pkts.append([pid, pkt2]) + for key in ['xmit', 'qdisc_xmit', 'xmit2', 'nic', 'id', + 'msg_length', 'priority', 'tx_node', 'tx_core', + 'free_tx_skb', 'tx_qid', 'type']: + if key in pkt: + pkt2[key] = pkt[key] + if pkt2['msg_length'] != None and pkt2['offset'] > pkt2['msg_length']: + print('Bogus lengths in new packet: %s' % (pkt2)) + offset += length + if not 'segments' in pkt: + pkt['segments'] = [pkt2] + else: + pkt['segments'].append(pkt2) + for pid, pkt in new_pkts: + packets[pid] = pkt + + for pkt in grants.values(): + if not 'tx_node' in pkt: + pkt['tx_node'] = get_rpc_node(pkt['id']) + if not 'rx_node' in pkt: + pkt['rx_node'] = get_rpc_node(pkt['id']^1) + +#------------------------------------------------ +# Analyzer: pairs +#------------------------------------------------ +class AnalyzePairs: + """ + For each pair of nodes, outputs statistics about packet delays and + backlog as of the end of the traces. + """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + + def output(self): + global traces, options, packets + print('\n-------------------') + print('Analyzer: pairs') + print('-------------------') + + # node -> dictionary mapping from node to the statistics about + # the node pair. 
+ pairs = {}
+ backlog_time = get_first_end()
+
+ for src in get_sorted_nodes():
+ dsts = {}
+ for dst in get_sorted_nodes():
+ if dst == src:
+ continue
+ dsts[dst] = {'delays': [], 'backlog': 0, 'xmit': 0}
+ pairs[src] = dsts
+
+ for pkt in packets.values():
+ if not 'nic' in pkt:
+ continue
+ if not pkt['tx_node']:
+ continue
+ if not pkt['rx_node']:
+ continue
+ src = pkt['tx_node']
+ dst = pkt['rx_node']
+ if pkt['nic'] >= traces[dst]['last_time']:
+ continue
+ if pkt['nic'] < traces[dst]['first_time']:
+ continue
+ info = pairs[pkt['tx_node']][pkt['rx_node']]
+ info['xmit'] += 1
+ if 'gro' in pkt:
+ info['delays'].append(pkt['gro'] - pkt['nic'])
+ if not ('gro' in pkt) or (pkt['gro'] > backlog_time):
+ info['backlog'] += 1
+
+ print('Statistics about data packets sent between each distinct pair')
+ print('of nodes:')
+ print('Source: Node that transmitted packets')
+ print('Dest: Node to which packets were sent')
+ print('Xmits: Total number of packets sent from Source to Dest')
+ print('Backlog: Number of packets that had been sent but not received')
+ print(' as of the end of the traces (time %.1f)' %
+ (backlog_time))
+ print('DelayP10: 10th percentile delay (usec NIC to GRO) for received packets')
+ print('DelayP50: 50th percentile delay (usec NIC to GRO) for received packets')
+ print('DelayP90: 90th percentile delay (usec NIC to GRO) for received packets')
+ print('DelayP99: 99th percentile delay (usec NIC to GRO) for received packets')
+ print()
+ print('Source Dest Xmits Backlog DelayP10 DelayP50 DelayP90 DelayP99')
+ first = True
+ for src in get_sorted_nodes():
+ if not first:
+ print('')
+ for dst in get_sorted_nodes():
+ if dst == src:
+ continue
+ info = pairs[src][dst]
+ delays = sorted(info['delays'])
+ if delays:
+ print('%6s %6s %6d %6d %7.1f %7.1f %7.1f %7.1f' % (
+ src, dst, info['xmit'], info['backlog'],
+ delays[10*len(delays)//100],
+ delays[len(delays)//2],
+ delays[90*len(delays)//100],
+ delays[99*len(delays)//100]))
+ else:
+ print('%6s %6s %6d %6d %6d' % (src, dst, info['xmit'],
+ info['backlog'], len(delays)))
+ first = False
+
+#------------------------------------------------
+# Analyzer: pass
+#------------------------------------------------
+class AnalyzePass:
+ """
+ Compute statistics on "passing", where a packet A passes a packet B
+ if both are sent to the same destination and B was transmitted before
+ A, but A arrived before B. This information will indicate whether or
+ not priority queues are being used properly. If the --same-gro-core
+ option is specified, then packets must be processed by the same GRO
+ core at the destination in order to be considered for passing.
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+
+ def output(self):
+ global packets
+
+ print('\n--------------')
+ print('Analyzer: pass')
+ print('--------------')
+ doc = 'Statistics on passing. A packet A has passed a packet B if A '
+ doc += 'and B are sent to the same destination node and A was '
+ doc += 'transmitted after B but arrived at GRO before B. '
+ if options.same_gro_core:
+ doc += 'Since the --same-gro-core option was specified, A and B '
+ doc += 'must also have been handled by the same GRO core at the '
+ doc += 'destination. '
+ doc += 'The term "gain" refers to the largest difference in '
+ doc += 'transmission times between a packet and any of the packets '
+ doc += 'it passed.'
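+ # A small worked example of the definitions above (hypothetical times):
+ # if packet B enters the NIC at t=100 and packet A at t=104, but A
+ # reaches GRO before B, then A has passed B and A's gain is at least
+ # 104 - 100 = 4 usec (the maximum such difference over all packets
+ # that A passed).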
+ print(textwrap.fill(doc, width=70)) + print('Node: Destination node for packets') + print('Packets: Total data packets sent to Node') + print('PassFrac: Fraction of packets that passed a lower-priority packet') + print('GainP50: 50th percentile gain of packets that passed a ' + 'lower-priority') + print(' packet (usecs)') + print('GainP90: 90th percentile gain of packets that passed a ' + 'lower-priority') + print(' packet (usecs)') + print('GainMax: Maximum gain of any packet that passed a ' + 'lower-priority') + print(' packet (usecs)') + print('RFrac: Fraction of packets that passed a higher-priority ' + 'packet ("reverse")') + print('RP50: 50th percentile gain of packets that passed a ' + 'higher-priority') + print(' packet (usecs)') + print('RP90: 90th percentile gain of packets that passed a ' + 'higher-priority') + print(' packet (usecs)') + print('RMax: Maximum gain of any packet that passed a ' + 'higher-priority packet') + print('\nNode Packets PassFrac GainP50 GainP90 GainMax RFrac RP50 RP90 RMax') + + # Node -> list of all data packets sent to that node. The list will + # eventually be sorted by packet transmission time (nic). + node_pkts = defaultdict(list) + + for pkt in packets.values(): + if not 'nic' in pkt or not 'gro' in pkt or not pkt['rx_node']: + continue + if not 'priority' in pkt: + continue + node_pkts[pkt['rx_node']].append(pkt) + for pkts in node_pkts.values(): + pkts.sort(key=lambda d: d['nic']) + for node in get_sorted_nodes(): + pkts = node_pkts[node] + + # Active packets (those that have been sent but have not yet been + # discovered to have been received), sorted in order of 'nic'. + active = deque() + + # For each packet that passed a lower-priority packet, this list + # contains one element (the "gain"), which is the largest + # difference in transmission times between the passing + # packet and any of the packets it passed. + gains = [] + + # Same as gains, except when a lower-priority packet passes + # a higher-priority one. + lgains = [] + + # Scan packets sent to the current node in order of 'nic' time, + # gathering data about inversions + for pkt in pkts: + nic = pkt['nic'] + gro = pkt['gro'] + priority = pkt['priority'] + gro_core = pkt['gro_core'] + + # Drop "active" packets that have completed. 
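+ # (pkts is sorted by 'nic', so once a packet's 'gro' time is <= the
+ # current packet's 'nic' time, no current or later packet can possibly
+ # pass it; it can safely be dropped from the front of the deque)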
+ while len(active) > 0 and active[0]['gro'] <= nic:
+ active.popleft()
+
+ have_gain = False
+ have_lgain = False
+ for i, p2 in enumerate(active):
+ if gro >= p2['gro']:
+ continue
+ if options.same_gro_core and gro_core != p2['gro_core']:
+ continue
+ gain = nic - p2['nic']
+ if p2['priority'] < priority:
+ if not have_gain:
+ gains.append(gain)
+ have_gain = True
+ elif p2['priority'] > priority and not have_lgain:
+ lgains.append(gain)
+ have_lgain = True
+ active.append(pkt)
+
+ # Print statistics
+ gains.sort()
+ num_passes = len(gains)
+ lgains.sort()
+ num_lpasses = len(lgains)
+ num_pkts = len(pkts)
+ print('%-10s %8d %6.3f %7.1f %7.1f %7.1f' % (node, num_pkts,
+ num_passes/num_pkts, gains[50*num_passes//100],
+ gains[90*num_passes//100], gains[-1]), end='')
+ print(' %6.3f%6.1f%6.1f %6.1f' % (
+ num_lpasses/num_pkts, lgains[50*num_lpasses//100],
+ lgains[90*num_lpasses//100], lgains[-1]))
+
+#------------------------------------------------
+# Analyzer: qbytes
+#------------------------------------------------
+class AnalyzeQbytes:
+ """
+ Computes the amount of packet data of various kinds (Homa data, TCP
+ data, etc.) queued in the network at each point in time. Requires the
+ --plot option.
+ """
+ def __init__(self, dispatcher):
+ require_options('qbytes', 'plot')
+ dispatcher.interest('AnalyzePackets')
+ dispatcher.interest('AnalyzeTcppackets')
+ dispatcher.interest('AnalyzeRpcs')
+ dispatcher.interest('AnalyzeMinlatency')
+ dispatcher.interest('AnalyzeIntervals')
+
+ def analyze(self):
+ """
+ Computes interval fields related to queued data.
+ """
+ global packets, rpcs, min_latency, intervals, grant_pkt_length
+ global tcp_packets
+
+ for pkt_type, pkts in [['data', packets.values()],
+ ['grant', grants.values()],
+ ['tcp', tcp_packets.values()]]:
+ for pkt in pkts:
+ if not 'nic' in pkt or not 'gro' in pkt:
+ if pkt_type == 'tcp':
+ trace = None
+ if 'gro' in pkt:
+ t = pkt['gro']
+ if pkt['saddr'] in ip_to_node:
+ trace = traces[ip_to_node[pkt['saddr']]]
+ else:
+ t = pkt['nic']
+ if pkt['daddr'] in ip_to_node:
+ trace = traces[ip_to_node[pkt['daddr']]]
+ if (trace != None and trace['first_time'] < (t-1000) and
+ trace['last_time'] > (t+1000)):
+ print('%9.3f: incomplete TCP packet for %s:%d to %s:%d (peer %s, '
+ 'start %.3f, end %.3f): %s' %
+ (t, pkt['saddr'], pkt['sport'],
+ pkt['daddr'], pkt['dport'], trace['node'],
+ trace['first_time'], trace['last_time'],
+ pkt))
+ continue
+ gro = pkt['gro']
+ nic = pkt['nic']
+ tx_node = pkt['tx_node']
+ rx_node = pkt['rx_node']
+
+ # The packet is assumed to be queued if its latency
+ # exceeds min_latency for the nodes; it is assumed to be
+ # queued for the last part of this time (that's not quite
+ # accurate since the queuing is probably in the switch and
+ # there is probably additional delay after the packet has
+ # been received but before GRO gets it).
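+ # For example, if min_latency from the sender to the receiver is
+ # 5 usec and a packet is handed to the NIC at t=100 but only reaches
+ # GRO at t=112, the packet is treated as queued over [105, 112].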
+ q_start = nic + min_latency[tx_node][rx_node]
+ if q_start < gro:
+ if pkt_type == 'grant':
+ add_to_intervals(rx_node, q_start, gro, 'q_homa_grant',
+ grant_pkt_length)
+ elif pkt_type == 'data':
+ rpc = rpcs[pkt['id']^1]
+ length = pkt['length'] + data_hdr_length
+ if 'unsched' in rpc and pkt['offset'] < rpc['unsched']:
+ add_to_intervals(rx_node, q_start, gro,
+ 'q_homa_unsched', length)
+ else:
+ add_to_intervals(rx_node, q_start, gro,
+ 'q_homa_sched', length)
+ else:
+ add_to_intervals(rx_node, q_start, gro, 'q_tcp',
+ pkt['length'] + tcp_hdr_length)
+
+ def init_axis(self, ax, x_min, x_max, y_max, size=10):
+ """
+ Initialize an axis for plotting queued bytes.
+ """
+ ax.set_xlim(x_min, x_max)
+ ax.set_ylim(0, y_max)
+ ax.tick_params(right=True, which='both', direction='in', length=5)
+ ax.set_xlabel('Time (μsec)', size=size)
+ ax.set_ylabel('Queued Data (KB)', size=size)
+
+ def output(self):
+ global grants, options, packets, rpcs
+ nodes = get_sorted_nodes()
+
+ print('\n----------------')
+ print('Analyzer: qbytes')
+ print('----------------')
+ print('See qbytes.pdf in %s' % (options.plot))
+
+ # Node -> dictionary with ready-to-plot data series for the node:
+ # grant, unsched, sched, and tcp. The data are cumulative: sched
+ # includes sched, unsched, and grant. Values correspond to time_data.
+ node_data = defaultdict(lambda: {'grant': [], 'unsched': [],
+ 'sched': [], 'tcp': []})
+
+ # End-of-interval time values correspond to data points in node_data.
+ time_data = []
+
+ # node -> dictionary with maximum observed queuing across various
+ # categories.
+ node_max = defaultdict(lambda:{'grant': 0, 'unsched': 0, 'sched': 0,
+ 'tcp': 0, 'total': 0})
+
+ # Largest 'total' value in the dictionary above.
+ overall_node_max = 0
+
+ # Ready-to-plot data series that hold totals across all nodes,
+ # corresponding to time_data.
+ total_grant_data = []
+ total_unsched_data = []
+ total_sched_data = []
+ total_tcp_data = []
+
+ # Maximum values of sums across all nodes
+ max_grant = 0
+ max_unsched = 0
+ max_sched = 0
+ max_tcp = 0
+ max_total = 0
+
+ # Generate data to plot. Each iteration of this outer loop processes
+ # the interval data for all nodes at a given time.
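+ # The four series are nested (grant, then grant+unsched, then
+ # grant+unsched+sched, then +tcp) so they can be drawn as a stacked
+ # step plot below; e.g. an interval with 3000 B of queued grants and
+ # 10000 B of queued unscheduled data appends 3.0 to 'grant' and 13.0
+ # to 'unsched' (values are in KB).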
+ t = options.interval * math.floor(get_last_start()/options.interval) + end_time = get_first_end() + while t < end_time: + total_grant = 0 + total_unsched = 0 + total_sched = 0 + total_tcp = 0 + for node in nodes: + data = node_data[node] + interval = get_interval(node, t) + max = node_max[node] + + val = interval['q_homa_grant'] + sum = val + data['grant'].append(sum/1000) + if val > max['grant']: + max['grant'] = val + total_grant += val + + val = interval['q_homa_unsched'] + sum += val + data['unsched'].append(sum/1000) + if val > max['unsched']: + max['unsched'] = val + total_unsched += val + + val = interval['q_homa_sched'] + sum += val + data['sched'].append(sum/1000) + if val > max['sched']: + max['sched'] = val + total_sched += val + + val = interval['q_tcp'] + sum += val + data['tcp'].append(sum/1000) + if val > max['tcp']: + max['tcp'] = val + total_tcp += val + + if sum > max['total']: + max['total'] = sum + if sum > overall_node_max: + overall_node_max = sum + + total_grant_data.append(total_grant/1000) + if total_grant > max_grant: + max_grant = total_grant + + sum = total_grant + total_unsched + total_unsched_data.append(sum/1000) + if total_unsched > max_unsched: + max_unsched= total_unsched + + sum += total_sched + total_sched_data.append(sum/1000) + if total_sched > max_sched: + max_sched= total_sched + + sum += total_tcp + total_tcp_data.append(sum/1000) + if total_tcp > max_tcp: + max_tcp= total_tcp + + if sum > max_total: + max_total = sum + time_data.append(t) + t += options.interval + + # Print summary statistics + print('\nLargest observed queued incoming data (KB):') + print('Node: Name of node') + print('Total: Maximum total queued bytes for the node') + print('Grants: Maximum queued bytes from grant packets') + print('Unsched: Maximum queued bytes in unscheduled data packets') + print('Sched: Maximum queued bytes in scheduled data packets') + print('Tcp: Maximum queued bytes in TCP packets\n') + print('The Total line shows the maximum instantaneous sum across all ' + 'nodes.') + print('Node Total Grants Unsched Sched Tcp') + for node in nodes: + max = node_max[node] + print('%-10s %8d %8d %8d %8d %8d' % (node, max['total']/1000, + max['grant']/1000, max['unsched']/1000, max['sched']/1000, + max['tcp']/1000)) + print('Total %8d %8d %8d %8d %8d' % (max_total/1000, max_grant/1000, + max_unsched/1000, max_sched/1000, max_tcp/1000)) + + # Generate a stacked graph. The top plot contains cluster-wide totals; + # subsequent plots show data for each individual node. 
+ fig, axes = plt.subplots(nrows=len(nodes) + 1, ncols=1, sharex=False,
+ figsize=[8, (1 + len(nodes))*2])
+ ax = axes[0]
+ ax.set_title("Total Across All Nodes", size=10)
+ x_min = get_last_start()
+ x_max = get_first_end()
+ self.init_axis(ax, x_min, x_max, max_total/1000)
+ ax.step(time_data, total_grant_data, where='pre',
+ label='Homa grants', color=color_red)
+ ax.step(time_data, total_unsched_data, where='pre',
+ label='Homa unscheduled data', color=color_blue)
+ ax.step(time_data, total_sched_data, where='pre',
+ label='Homa scheduled data', color=color_brown)
+ ax.step(time_data, total_tcp_data, where='pre',
+ label='TCP', color=color_green)
+ for i in range(len(nodes)):
+ node = nodes[i]
+ data = node_data[node]
+ ax = axes[i+1]
+ self.init_axis(ax, x_min, x_max, overall_node_max/1000)
+ ax.set_title(node, size=10)
+ ax.step(time_data, data['grant'], where='pre', color=color_red)
+ ax.step(time_data, data['unsched'], where='pre', color=color_blue)
+ ax.step(time_data, data['sched'], where='pre', color=color_brown)
+ ax.step(time_data, data['tcp'], where='pre', color=color_green)
+ fig.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.02),
+ frameon=False, prop={'size': 9})
+ plt.tight_layout()
+ plt.savefig("%s/qbytes.pdf" % (options.plot), bbox_inches='tight')
+
+#------------------------------------------------
+# Analyzer: qdelay
+#------------------------------------------------
+class AnalyzeQdelay:
+ """
+ Generates scatter plots that show the queuing delay for each packet
+ sent from (or received by) a given node. Queuing delay is the
+ actual latency for a packet (from when packet is queued for the NIC
+ until it is processed by GRO) minus the smallest latency observed for
+ the same source-destination pair. Queuing delay can be caused by either
+ queuing in the network or delays in invoking the GRO handler on the
+ destination. Requires the --plot option.
+ """
+
+ def __init__(self, dispatcher):
+ require_options('qdelay', 'plot')
+ dispatcher.interest('AnalyzePackets')
+ dispatcher.interest('AnalyzeRpcs')
+ dispatcher.interest('AnalyzeMinlatency')
+ dispatcher.interest('AnalyzeIntervals')
+
+ def init_qdelay_axis(self, ax, title, x_min, x_max, max_qdelay, size=10):
+ """
+ Initializes a pyplot axis that will be used for a scatter plot of
+ queuing delay for each packet over time.
+
+ ax: Axis to initialize
+ title: Title for the plot; may be empty
+ x_min: Lowest value for x-axis (usecs)
+ x_max: Highest value for x-axis (usecs)
+ max_qdelay: Largest value that will be displayed as y (queuing
+ delay in usecs).
+ size: Size to use for fonts
+ """
+ global options
+
+ if title != "":
+ ax.set_title(title, size=size)
+ ax.set_xlim(x_min, x_max)
+ ax.set_ylim(2, max_qdelay)
+ ax.set_yscale("log")
+ ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: '%d' % (x)))
+ ax.tick_params(right=True, which='both', direction='in', length=5)
+ ax.set_xlabel('Time of Packet Tx (μsec)', size=size)
+ ax.set_ylabel('Queuing Delay (μsec)', size=size)
+
+ return ax
+
+ def output(self):
+ global grants, options, packets, rpcs
+ nodes = get_sorted_nodes()
+
+ # Node -> <times, qdelays, colors> for all of the data packets
+ # received by the node. times is a list of packet GRO times, qdelays
+ # is a list of corresponding queuing delays, and colors is a list
+ # of colors to use for each point.
+ rx_delays = defaultdict(lambda: [[], [], []])
+
+ # Same as rx_delays except that packets are those transmitted by
+ # the node and times are NIC times.
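+ # For example (hypothetical values), tx_delays['node1'] might be
+ # [[100.2, 104.7], [1.5, 23.0], [color_blue, color_red]]: two packets
+ # transmitted by node1 at the given NIC times, along with their
+ # queuing delays and scatter-plot colors.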
+ tx_delays = defaultdict(lambda: [[], [], []]) + + print('\n-----------------') + print('Analyzer: qdelay') + print('-----------------') + + print('See graphs qdelay_tx.pdf and qdelay_rx.pdf in %s' + % (options.plot)) + + # Collect data for the scatter plots. + overall_max_delay = 0 + for pkt_type, pkts in [['data', packets.values()], + ['grant', grants.values()]]: + for pkt in pkts: + if not 'nic' in pkt or not 'gro' in pkt: + continue + gro = pkt['gro'] + nic = pkt['nic'] + tx_node = pkt['tx_node'] + rx_node = pkt['rx_node'] + qdelay = (gro - nic) - min_latency[tx_node][rx_node] + if pkt_type == 'grant': + color = color_brown + else: + rpc = rpcs[pkt['id']] + if rpc['out_length'] < 1000: + color = color_red + else: + color = color_blue + tx_delays[tx_node][0].append(nic) + tx_delays[tx_node][1].append(qdelay) + tx_delays[tx_node][2].append(color) + rx_delays[rx_node][0].append(nic) + rx_delays[rx_node][1].append(qdelay) + rx_delays[rx_node][2].append(color) + if qdelay > overall_max_delay: + overall_max_delay = qdelay + + # Generate scatter plots + legend_handles = [ + matplotlib.lines.Line2D([], [], color=c, marker='o', + linestyle='None', markersize=8, label=label) + for c, label in [[color_red, 'Data (messages < 1000B)'], + [color_blue, 'Data (other messages)'], + [color_brown, 'Grants']] + ] + x_min = get_last_start() + x_max = get_first_end() + fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False, + figsize=[8, len(nodes)*2]) + for i in range(len(nodes)): + node = nodes[i] + ax = axes[i] + self.init_qdelay_axis(ax, + 'Incoming Packets on %s' % node, + x_min, x_max, overall_max_delay) + ax.scatter(rx_delays[node][0], rx_delays[node][1], + marker='o', s=1, c=rx_delays[node][2]) + fig.legend(handles=legend_handles, loc='lower center', ncol=3, + bbox_to_anchor=(0.5, -0.03), frameon=False) + plt.tight_layout() + plt.savefig("%s/qdelay_rx.pdf" % (options.plot), bbox_inches='tight') + + fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False, + figsize=[8, len(nodes)*2]) + for i in range(len(nodes)): + node = nodes[i] + ax = axes[i] + self.init_qdelay_axis(ax, + 'Outgoing Packets from %s' % node, + x_min, x_max, overall_max_delay) + ax.scatter(tx_delays[node][0], tx_delays[node][1], + marker='o', s=1, c=tx_delays[node][2]) + fig.legend(handles=legend_handles, loc='lower center', ncol=3, + bbox_to_anchor=(0.5, -0.03), frameon=False) + plt.tight_layout() + plt.savefig("%s/qdelay_tx.pdf" % (options.plot), bbox_inches='tight') + +#------------------------------------------------ +# Analyzer: rpcs +#------------------------------------------------ +class AnalyzeRpcs: + """ + Print information about Homa RPCs. The options --msglen, --rpc-start, + and --rtt may be used to filter the RPCs to print. By default the RPCs + are printed in order of start time, but that may be changed with the + --sort option. The --sort option is a list of the column names Start, + End, and Rtt; the RPCs will be sorted by each keyword in order before + printing. If --verbose is specified then the packets from the selected + RPCs are also printed. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + + def append(self, trace, id, t, name, value): + """ + Add a value to an element of an RPC's dictionary, creating the RPC + and the list if they don't exist already + + trace: Overall information about the trace file being parsed. 
+ id: Identifier for a specific RPC; stats for this RPC are + initialized if they don't already exist + t: Time of the current event + name: Name of a value in the RPC's record; will be created + if it doesn't exist + value: Value to append to the list indicated by id and name + """ + + global rpcs + rpc = rpcs[id] + if not name in rpc: + rpc[name] = [] + rpc[name].append(value) + + def tx_end(self, rpc): + """ + Returns the end of the tx_live interval for RPC; this may be the + last_time in the trace if transmission was incomplete at the end + of the trace, or None if there doesn't appear to be any tx activity + for RPC during the traces. + """ + global rpcs, traces + + if not 'sent' in rpc and (not rpc['send_data']): + return None + + ceiling = None + if 'end' in rpc: + ceiling = rpc['end'] + if not (rpc['id'] & 1): + rx_id = rpc['id']^1 + if rx_id in rpcs: + rx_rpc = rpcs[rx_id] + else: + rx_rpc = {} + if 'recvmsg_done' in rx_rpc: + ceiling = rx_rpc['recvmsg_done'] + elif 'sendmsg' in rx_rpc: + ceiling = rx_rpc['sendmsg'] + elif 'send_data_pkts' in rx_rpc and rx_rpc['send_data_pkts']: + ceiling = rx_rpc['send_data_pkts'][0]['xmit'] + elif rpc['gro_data']: + ceiling = rpc['gro_data'][0][0] + elif 'recvmsg_done' in rpc: + ceiling = rpc['recvmsg_done'] + elif 'sent' in rx_rpc: + ceiling = traces[rx_rpc['node']]['first_time'] + if rpc['send_data']: + if ceiling != None: + return rpc['send_data'][-1][0] + if rpc['send_data'][-1][2] < 1500: + return rpc['send_data'][-1][0] + if 'out_length' in rpc: + length = rpc['out_length'] + for t, offset, pkt_length in rpc['send_data']: + if (offset + pkt_length) >= length: + return rpc['send_data'][-1][0] + if ceiling == None: + return traces[rpc['node']]['last_time'] + return ceiling + + def set_live(self, rpc, peer): + """ + Sets the rx_live and tx_live fields in the given RPC. Peer is + the matching RPC on the peer node, or None if none. + """ + + global rpcs + + # tx_live + node = rpc['node'] + end = self.tx_end(rpc) + start = None + if 'sendmsg' in rpc: + start = rpc['sendmsg'] + elif ('sent' in rpc) or rpc['send_data_pkts']: + start = traces[node]['first_time'] + if start != None: + if end != None: + rpc['tx_live'] = [start, end] + else: + rpc['tx_live'] = [start, traces[node]['last_time'],] + elif end != None: + rpc['tx_live'] = [traces[node]['first_time'], end] + + # rx_live + start = None + if peer and ('sendmsg' in peer): + start = peer['sendmsg'] + else: + if peer and ('sent' in peer): + # Check for special case where both nodes think they are + # sending (first response packet hasn't been sent yet). 
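+ # (client RPC ids are even and the server's id for the same RPC
+ # is id^1, so the test of rpc['id'] & 1 below distinguishes the
+ # client side from the server side)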
+ if (not 'sent' in rpc) or not (rpc['id'] & 1):
+ start = traces[peer['node']]['first_time']
+ if rpc['gro_data_pkts']:
+ pkt = rpc['gro_data_pkts'][0]
+ if 'xmit' in pkt:
+ t = pkt['xmit']
+ else:
+ t = pkt['gro']
+ if (start == None) or (t < start):
+ start = t
+ if (start == None) and ('remaining' in rpc):
+ start = traces[node]['first_time']
+ if 'recvmsg_done' in rpc:
+ end = rpc['recvmsg_done']
+ elif 'remaining' in rpc:
+ end = traces[node]['last_time']
+ elif (start != None) and (start >= traces[node]['first_time']):
+ end = traces[node]['last_time']
+ else:
+ end = None
+ if (start != None) and (end != None):
+ rpc['rx_live'] = [start, end]
+
+ def tt_gro_data(self, trace, t, core, peer, id, offset, prio):
+ global rpcs, recv_offsets
+ self.append(trace, id, t, 'gro_data', [t, offset, prio])
+ rpcs[id]['peer'] = peer
+ rpcs[id]['gro_core'] = core
+ recv_offsets[offset] = True
+
+ def tt_gro_grant(self, trace, t, core, peer, id, offset, priority):
+ self.append(trace, id, t, 'gro_grant', [t, offset])
+ rpcs[id]['peer'] = peer
+ rpcs[id]['gro_core'] = core
+
+ def tt_rpc_handoff(self, trace, t, core, id):
+ rpcs[id]['handoff'] = t
+ rpcs[id].pop('queued', None)
+
+ def tt_ip_xmit(self, trace, t, core, id, offset):
+ global rpcs
+ rpcs[id]['ip_xmits'][offset] = t
+
+ def tt_rpc_queued(self, trace, t, core, id):
+ rpcs[id]['queued'] = t
+ rpcs[id].pop('handoff', None)
+
+ def tt_resend_rx(self, trace, t, core, id, offset, length):
+ global rpcs
+ rpcs[id]['resend_rx'].append([t, offset, length])
+
+ def tt_resend_tx(self, trace, t, core, id, peer, offset, length):
+ global rpcs
+ rpcs[id]['resend_tx'].append([t, offset])
+
+ def tt_retransmit(self, trace, t, core, id, offset, length):
+ global rpcs
+ rpcs[id]['retransmits'][offset] = [t, length]
+
+ def tt_softirq_data(self, trace, t, core, id, offset, length):
+ global rpcs
+ self.append(trace, id, t, 'softirq_data', [t, offset])
+ rpcs[id]['in_length'] = length
+
+ def tt_softirq_grant(self, trace, t, core, id, offset, priority, increment):
+ self.append(trace, id, t, 'softirq_grant', [t, offset])
+
+ def tt_send_data(self, trace, t, core, id, offset, length):
+ # Combine the length and other info from this record with the time
+ # from the ip_xmit call. No ip_xmit call? Skip this record too.
+ global rpcs + if not offset in rpcs[id]['ip_xmits']: + return + ip_xmits = rpcs[id]['ip_xmits'] + self.append(trace, id, t, 'send_data', [ip_xmits[offset], offset, length]) + del ip_xmits[offset] + + def tt_send_grant(self, trace, t, core, id, offset, priority, increment): + self.append(trace, id, t, 'send_grant', [t, offset, priority, increment]) + + def tt_sendmsg_request(self, trace, t, core, peer, id, length): + global rpcs + rpcs[id]['out_length'] = length + rpcs[id]['peer'] = peer + rpcs[id]['sendmsg'] = t + + def tt_sendmsg_response(self, trace, t, core, id, length): + global rpcs + rpcs[id]['sendmsg'] = t + rpcs[id]['out_length'] = length + + def tt_recvmsg_done(self, trace, t, core, id, status): + global rpcs + rpcs[id]['recvmsg_done'] = t + + def tt_wait_found_rpc(self, trace, t, core, id, type, blocked): + rpcs[id]['found'] = t + + def tt_copy_out_start(self, trace, t, core, id): + global rpcs + if not 'copy_out_start' in rpcs[id]: + rpcs[id]['copy_out_start'] = t + + def tt_copy_out_done(self, trace, t, core, id, start, end): + global rpcs + rpcs[id]['copy_out_done'] = t + + def tt_copy_in_done(self, trace, t, core, id, num_bytes): + global rpcs + rpcs[id]['copy_in_done'] = t + + def tt_unsched(self, trace, t, core, id, num_bytes): + global rpcs, max_unsched + rpcs[id]['unsched'] = num_bytes + if num_bytes > max_unsched: + max_unsched = num_bytes + + def tt_rpc_end(self, trace, t, core, id): + global rpcs + rpcs[id]['end'] = t + + def tt_rpc_incoming(self, trace, t, core, id, peer, received, length): + global rpcs + rpc = rpcs[id] + rpc['peer'] = peer + rpc['in_length'] = length + rpc['remaining'] = length - received + + def tt_rpc_incoming2(self, trace, t, core, id, incoming, granted): + global rpcs + rpc = rpcs[id] + rpc['granted'] = granted + rpc['stats_time'] = t + + def tt_rpc_incoming3(self, trace, t, core, id, length, remaining, rank): + global rpcs + rpcs[id]['rank'] = rank + + def tt_rpc_outgoing(self, trace, t, core, id, peer, sent, length): + global rpcs + rpc = rpcs[id] + rpc['peer'] = peer + rpc['out_length'] = length + rpc['sent'] = sent + + def analyze(self): + """ + Fill in various additional information related to RPCs + """ + global rpcs, traces, ip_to_node + + for id, rpc in rpcs.items(): + peer_id = id ^ 1 + if peer_id in rpcs: + peer_rpc = rpcs[peer_id] + else: + peer_rpc = None + + # Fill in peer_nodes + if 'peer' in rpc: + peer = rpc['peer'] + if (not peer in ip_to_node) and peer_rpc: + ip_to_node[peer] = peer_rpc['node'] + + # Deduce out_length if not already present. + if not 'out_length' in rpc: + if peer_rpc and (peer_rpc['in_length'] != None): + rpc['out_length'] = peer_rpc['in_length'] + else: + length = -1 + if 'send_data' in rpc: + for t, offset, pkt_length in rpc['send_data']: + l2 = offset + pkt_length + if l2 > length: + length = l2 + if 'softirq_grant' in rpc: + for t, offset in rpc['softirq_grant']: + if offset > length: + length = offset + if length >= 0: + rpc['out_length'] = length + + # Set rx_live and tx_live + self.set_live(rpc, peer_rpc) + + # Deduce in_length if not already present. 
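+ # (the receiver's in_length for an RPC must equal the sender's
+ # out_length for the matching RPC id^1, so copy the length across
+ # when only the sender recorded it)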
+ for id, rpc in rpcs.items():
+ if rpc['in_length'] == None:
+ sender_id = id^1
+ if sender_id in rpcs:
+ sender = rpcs[sender_id]
+ if 'out_length' in sender:
+ rpc['in_length'] = sender['out_length']
+
+ def output(self):
+ global rpcs, options
+
+ print('\n------------------')
+ print('Analyzer: rpcs')
+ print('------------------')
+
+ rpcs_to_print = filter_rpcs(rpcs.values(), msglen=options.msglen,
+ rpc_start=options.rpc_start, rtt=options.rtt)
+ if (options.msglen != None or options.rpc_start != None or
+ options.rtt != None):
+ print('%d Homa RPCs were selected using the following filters:' %
+ (len(rpcs_to_print)))
+ if options.msglen:
+ print(' --msglen %s' % (options.msglen))
+ if options.rpc_start:
+ print(' --rpc-start %s' % (options.rpc_start))
+ if options.rtt:
+ print(' --rtt %s' % (options.rtt))
+ else:
+ print('There are %d Homa RPCs in the traces' % (len(rpcs_to_print)))
+
+ sort_keys = options.sort
+ if sort_keys == None:
+ sort_keys = 'Start'
+ for key in sort_keys.split():
+ if key == 'Start':
+ rpcs_to_print = sorted(rpcs_to_print, key = lambda rpc:
+ rpc['sendmsg'] if 'sendmsg' in rpc else 1e20)
+ elif key == 'End':
+ rpcs_to_print = sorted(rpcs_to_print, key = lambda rpc:
+ rpc['recvmsg_done'] if 'recvmsg_done' in rpc else 1e20)
+ elif key == 'Rtt':
+ rpcs_to_print = sorted(rpcs_to_print, reverse = True, key = lambda rpc:
+ rpc['recvmsg_done'] - rpc['sendmsg']
+ if 'recvmsg_done' in rpc and 'sendmsg' in rpc else 0)
+ else:
+ raise Exception('Unknown sort key \'%s\' for rpcs '
+ 'analyzer' % (key))
+
+ # Collect and print overall statistics about the RPCs.
+ xmit = []
+ net = []
+ free = []
+ softirq = []
+ recv = []
+ srvc = []
+ rtt = []
+ for rpc in rpcs_to_print:
+ sid = rpc['id'] ^ 1
+ if sid in rpcs:
+ srpc = rpcs[sid]
+ else:
+ srpc = {}
+ if rpc['send_data_pkts']:
+ first_req_pkt = rpc['send_data_pkts'][0]
+ last_req_pkt = rpc['send_data_pkts'][-1]
+ else:
+ first_req_pkt = []
+ last_req_pkt = []
+ if rpc['gro_data_pkts']:
+ first_resp_pkt = rpc['gro_data_pkts'][0]
+ last_resp_pkt = rpc['gro_data_pkts'][-1]
+ else:
+ first_resp_pkt = []
+ last_resp_pkt = []
+ if 'nic' in first_req_pkt:
+ xmit.append(first_req_pkt['nic'] - rpc['sendmsg'])
+ if 'nic' in first_resp_pkt and 'sendmsg' in srpc:
+ xmit.append(first_resp_pkt['nic'] - srpc['sendmsg'])
+ for pkt in itertools.chain(rpc['send_data_pkts'],
+ rpc['gro_data_pkts']):
+ if 'gro' in pkt and 'nic' in pkt:
+ net.append(pkt['gro'] - pkt['nic'])
+ if 'free_tx_skb' in pkt and 'nic' in pkt:
+ free.append(pkt['free_tx_skb'] - pkt['nic'])
+ if 'softirq' in pkt and 'gro' in pkt:
+ softirq.append(pkt['softirq'] - pkt['gro'])
+ if 'softirq' in last_req_pkt and 'req_recvd' in rpc:
+ recv.append(rpc['req_recvd'] - last_req_pkt['softirq'])
+ if 'softirq' in last_resp_pkt and 'recvmsg_done' in rpc:
+ recv.append(rpc['recvmsg_done'] - last_resp_pkt['softirq'])
+ if 'recvmsg_done' in srpc and 'sendmsg' in srpc:
+ srvc.append(srpc['sendmsg'] - srpc['recvmsg_done'])
+ if 'sendmsg' in rpc and 'recvmsg_done' in rpc:
+ rtt.append(rpc['recvmsg_done'] - rpc['sendmsg'])
+ for l in [xmit, net, free, softirq, recv, srvc, rtt]:
+ l.sort()
+
+ print('\nOverall statistics about the selected RPCs. 
 Most of these '
+ 'statistics')
+ print('combine data from request messages and response messages.')
+ print('Xmit: Time from sendmsg until driver queued first '
+ 'packet for NIC')
+ print('Net: Time from NIC handoff to GRO receipt for packets')
+ print('Free: Time from when NIC received packet until packet '
+ 'was returned')
+ print(' to Linux and freed')
+ print('SoftIrq: Time from when packet was received by GRO until it '
+ 'was received')
+ print(' by SoftIRQ')
+ print('Recv: Time from SoftIRQ for last packet in a message '
+ 'until recvmsg completes')
+ print('Srvc: Time from recvmsg return on server until '
+ 'sendmsg for response')
+ print('Rtt: Total time from request sendmsg until recvmsg '
+ 'completes for response\n')
+
+ print(' Min P10 P50 P90 P99 Max')
+ pctls = [0, 100, 500, 900, 990, 1000]
+ print('Xmit %8s %8s %8s %8s %8s %8s' % tuple(
+ print_pctl(xmit, p, '%.1f') for p in pctls))
+ print('Net %8s %8s %8s %8s %8s %8s' % tuple(
+ print_pctl(net, p, '%.1f') for p in pctls))
+ print('Free %8s %8s %8s %8s %8s %8s' % tuple(
+ print_pctl(free, p, '%.1f') for p in pctls))
+ print('SoftIrq %8s %8s %8s %8s %8s %8s' % tuple(
+ print_pctl(softirq, p, '%.1f') for p in pctls))
+ print('Recv %8s %8s %8s %8s %8s %8s' % tuple(
+ print_pctl(recv, p, '%.1f') for p in pctls))
+ print('Srvc %8s %8s %8s %8s %8s %8s' % tuple(
+ print_pctl(srvc, p, '%.1f') for p in pctls))
+ print('Rtt %8s %8s %8s %8s %8s %8s' % tuple(
+ print_pctl(rtt, p, '%.1f') for p in pctls))
+
+ # Print a summary line for each RPC.
+ print('\nSummary information for each selected RPC:')
+ print(print_rpcs(rpcs_to_print, header = True), end='')
+
+ if options.verbose:
+ first = True
+ print('\nPackets from the selected RPCs (in the same RPC order as '
+ 'above):')
+ for rpc in rpcs_to_print:
+ if not first:
+ print()
+ print(print_pkts(rpc['send_data_pkts'], header=first), end='')
+ print(print_pkts(rpc['gro_data_pkts'], header=False), end='')
+ first = False
+
+#------------------------------------------------
+# Analyzer: rtt
+#------------------------------------------------
+class AnalyzeRtt:
+ """
+ Prints statistics about round-trip times for short RPCs and identifies
+ RPCs with the longest RTTs. The --max-rtt option can be used to restrict
+ the time range for the "long" RPCs to print out.
+ """
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzeRpcs')
+ return
+
+ def output(self):
+ global rpcs, ip_to_node, options
+
+ # List with one entry for each short RPC, containing a tuple
+ # <rtt, id, start, end, client, server>, where rtt is the round-trip
+ # time, id is the client's RPC id, start and end are the beginning
+ # and ending times, and client and server are the names of the two
+ # nodes involved.
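+ # For example, a hypothetical entry [52.1, 422, 1000.0, 1052.1,
+ # 'node1', 'node4'] would describe a 52.1-usec RPC with client id 422
+ # that started at t=1000.0 on node1 and was served by node4.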
+ rtts = []
+
+ for id, rpc in rpcs.items():
+ if id & 1:
+ continue
+ if (not 'sendmsg' in rpc) or (not 'recvmsg_done' in rpc):
+ continue
+ if (not 'out_length' in rpc) or (rpc['out_length'] > 1500):
+ continue
+ if (rpc['in_length'] == None) or (rpc['in_length'] > 1500):
+ continue
+ rtts.append([rpc['recvmsg_done'] - rpc['sendmsg'], id,
+ rpc['sendmsg'], rpc['recvmsg_done'], rpc['node'],
+ ip_to_node[rpc['peer']]])
+
+ rtts.sort(key=lambda t : t[0])
+
+ print('\n-------------')
+ print('Analyzer: rtt')
+ print('-------------')
+ if not rtts:
+ print('Traces contained no short RPCs (<= 1500 bytes)')
+ return
+ print('Round-trip times for %d short RPCs (<= 1500 bytes):'
+ % (len(rtts)))
+ print('Min: %6.1f' % rtts[0][0])
+ print('P10: %6.1f' % rtts[10*len(rtts)//100][0])
+ print('P50: %6.1f' % rtts[50*len(rtts)//100][0])
+ print('P90: %6.1f' % rtts[90*len(rtts)//100][0])
+ print('P99: %6.1f' % rtts[99*len(rtts)//100][0])
+ print('Max: %6.1f' % rtts[len(rtts) - 1][0])
+
+ def get_phase(rpc1, phase1, rpc2, phase2):
+ """
+ Returns the elapsed time from phase1 in rpc1 to phase2 in
+ rpc2, or None if the required data is missing.
+ """
+ if phase1 not in rpc1:
+ return None
+ start = rpc1[phase1]
+ if type(start) == list:
+ if not start:
+ return None
+ start = start[0][0]
+ if phase2 not in rpc2:
+ return None
+ end = rpc2[phase2]
+ if type(end) == list:
+ if not end:
+ return None
+ end = end[0][0]
+ return end - start
+
+ def get_phases(crpc, srpc):
+ """
+ Returns a dictionary containing the delays for each phase in
+ the RPC recorded on the client side in crpc and the server side
+ in srpc. Each phase measures from the end of the previous phase;
+ if data wasn't available for a phase then the value will be None.
+ prep: From sendmsg until call to ip*xmit on client
+ net: To GRO on the server
+ gro: To SoftIRQ on the server
+ softirq: To homa_rpc_handoff
+ handoff: Handoff to waiting thread
+ queue: Wait on queue for receiving thread (alternative to
+ handoff: one of these will be None)
+ sendmsg: To sendmsg call on server
+ prep2: To call to ip*xmit on server
+ net2: To GRO on the client
+ gro2: To SoftIRQ on the client
+ softirq2: To homa_rpc_handoff on client
+ handoff2: Handoff to waiting thread
+ queue2: Wait on queue for receiving thread (only one of
+ this and handoff2 will be set)
+ done: To return from recvmsg on client
+ """
+ global rpcs
+
+ result = {}
+
+ result['prep'] = get_phase(crpc, 'sendmsg', crpc, 'send_data')
+ result['net'] = get_phase(crpc, 'send_data', srpc, 'gro_data')
+ result['gro'] = get_phase(srpc, 'gro_data', srpc, 'softirq_data')
+ if 'queued' in srpc:
+ result['softirq'] = get_phase(srpc, 'softirq_data', srpc, 'queued')
+ if (result['softirq'] != None) and (result['softirq'] < 0):
+ result['softirq'] = 0
+ result['queue'] = get_phase(srpc, 'queued', srpc, 'found')
+ result['handoff'] = None
+ else:
+ result['softirq'] = get_phase(srpc, 'softirq_data', srpc, 'handoff')
+ if (result['softirq'] != None) and (result['softirq'] < 0):
+ result['softirq'] = 0
+ result['handoff'] = get_phase(srpc, 'handoff', srpc, 'found')
+ result['queue'] = None
+ result['sendmsg'] = get_phase(srpc, 'found', srpc, 'sendmsg')
+ result['prep2'] = get_phase(srpc, 'sendmsg', srpc, 'send_data')
+ result['net2'] = get_phase(srpc, 'send_data', crpc, 'gro_data')
+ result['gro2'] = get_phase(crpc, 'gro_data', crpc, 'softirq_data')
+ if 'queued' in crpc:
+ result['softirq2'] = get_phase(crpc, 'softirq_data', crpc, 'queued')
+ if (result['softirq2'] != None) and (result['softirq2'] < 0):
+ result['softirq2'] = 0
+ result['queue2'] = get_phase(crpc, 
'queued', crpc, 'found')
+ result['handoff2'] = None
+ else:
+ result['softirq2'] = get_phase(crpc, 'softirq_data', crpc, 'handoff')
+ if (result['softirq2'] != None) and (result['softirq2'] < 0):
+ result['softirq2'] = 0
+ result['handoff2'] = get_phase(crpc, 'handoff', crpc, 'found')
+ result['queue2'] = None
+ result['done'] = get_phase(crpc, 'found', crpc, 'recvmsg_done')
+ return result
+
+ print('\nShort RPCs with the longest RTTs:')
+ print('RTT: Round-trip time (usecs)')
+ print('Client Id: RPC id as seen by client')
+ print('Server: Node that served the RPC')
+ print('Start: Time of sendmsg invocation on client')
+ print('Prep: Time until request passed to ip*xmit')
+ print('Net: Time for request to reach server GRO')
+ print('GRO: Time to finish GRO and wakeup homa_softirq on server')
+ print('SIRQ: Time until server homa_softirq invokes homa_rpc_handoff')
+ print('Handoff: Time to pass RPC to waiting thread (if thread waiting)')
+ print('Queue: Time RPC is enqueued until receiving thread arrives')
+ print('App: Time until application wakes up and invokes sendmsg '
+ 'for response')
+ print('Prep2: Time until response passed to ip*xmit')
+ print('Net2: Time for response to reach client GRO')
+ print('GRO2: Time to finish GRO and wakeup homa_softirq on client')
+ print('SIRQ2: Time until client homa_softirq invokes homa_rpc_handoff')
+ print('Hand2: Time to pass RPC to waiting thread (if thread waiting)')
+ print('Queue2: Time RPC is enqueued until receiving thread arrives')
+ print('Done: Time until recvmsg returns on client')
+ print('')
+ print(' RTT Client Id Server Start Prep Net GRO SIRQ '
+ 'Handoff Queue App Prep2 Net2 GRO2 SIRQ2 Hand2 Queue2 Done')
+ print('----------------------------------------------------------------'
+ '----------------------------------------------------------------')
+ slow_phases = []
+ slow_rtt_sum = 0
+ to_print = 20
+ max_rtt = 1e20
+ if options.max_rtt != None:
+ max_rtt = options.max_rtt
+ for i in range(len(rtts)-1, -1, -1):
+ rtt, id, start, end, client, server = rtts[i]
+ if rtt > max_rtt:
+ continue
+ crpc = rpcs[id]
+ server_id = id ^ 1
+ if not server_id in rpcs:
+ continue
+ srpc = rpcs[server_id]
+ phases = get_phases(crpc, srpc)
+ slow_phases.append(phases)
+ slow_rtt_sum += rtt
+
+ def fmt_phase(phase, size=6):
+ if (phase == None):
+ return ' '*size
+ else:
+ return ('%' + str(size) + '.1f') % (phase)
+
+ print('%6.1f %12d %10s %9.3f %s' % (rtt, id, server, start,
+ fmt_phase(phases['prep'], 4)), end='')
+ print(' %s %s %s %s' % (fmt_phase(phases['net']),
+ fmt_phase(phases['gro'], 5),
+ fmt_phase(phases['softirq'], 4),
+ fmt_phase(phases['handoff'])), end='')
+ print(' %s %s %s %s' % (
+ fmt_phase(phases['queue']), fmt_phase(phases['sendmsg'], 4),
+ fmt_phase(phases['prep2'], 5), fmt_phase(phases['net2'])),
+ end='')
+ print(' %s %s %s %s %s' % (fmt_phase(phases['gro2'], 5),
+ fmt_phase(phases['softirq2'], 5), fmt_phase(phases['handoff2']),
+ fmt_phase(phases['queue2'], 6), fmt_phase(phases['done'], 4)))
+ to_print -= 1
+ if to_print == 0:
+ break
+
+ # Print out phase averages for fast RPCs.
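+ # ('fast' here means the ten smallest RTTs: rtts is sorted in
+ # increasing order, and the loop below stops once ten phase
+ # dictionaries have been collected)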
+ fast_phases = []
+ fast_rtt_sum = 0
+ for i in range(len(rtts)):
+ rtt, id, start, end, client, server = rtts[i]
+ crpc = rpcs[id]
+ server_id = id ^ 1
+ if not server_id in rpcs:
+ continue
+ srpc = rpcs[server_id]
+ fast_phases.append(get_phases(crpc, srpc))
+ fast_rtt_sum += rtt
+ if len(fast_phases) >= 10:
+ break
+ print('\nAverage times for the fastest short RPCs:')
+ print(' RTT Prep Net GRO SIRQ '
+ 'Handoff Queue App Prep2 Net2 GRO2 SIRQ2 Hand2 Queue2 Done')
+ print('----------------------------------------------------------------'
+ '----------------------------------------------------------------')
+ print('%6.1f %33s %4.1f %6.1f %5.1f' % (
+ fast_rtt_sum/len(fast_phases), '',
+ dict_avg(fast_phases, 'prep'), dict_avg(fast_phases, 'net'),
+ dict_avg(fast_phases, 'gro')), end='')
+ print(' %4.1f %7.1f %6.1f %4.1f %5.1f' % (
+ dict_avg(fast_phases, 'softirq'), dict_avg(fast_phases, 'handoff'),
+ dict_avg(fast_phases, 'queue'), dict_avg(fast_phases, 'sendmsg'),
+ dict_avg(fast_phases, 'prep2')), end='')
+ print(' %6.1f %6.1f %5.1f %6.1f %6.1f %4.1f' % (
+ dict_avg(fast_phases, 'net2'), dict_avg(fast_phases, 'gro2'),
+ dict_avg(fast_phases, 'softirq2'), dict_avg(fast_phases, 'handoff2'),
+ dict_avg(fast_phases, 'queue2'), dict_avg(fast_phases, 'done')))
+
+ # Print out how much slower each phase is for slow RPCs than
+ # for fast ones.
+ print('\nAverage extra time spent by slow RPCs relative to fast ones:')
+ print(' RTT Prep Net GRO SIRQ '
+ 'Handoff Queue App Prep2 Net2 GRO2 SIRQ2 Hand2 Queue2 Done')
+ print('----------------------------------------------------------------'
+ '----------------------------------------------------------------')
+ print('%6.1f %33s %4.1f %6.1f %5.1f' % (
+ slow_rtt_sum/len(slow_phases) - fast_rtt_sum/len(fast_phases),
+ '',
+ dict_avg(slow_phases, 'prep') - dict_avg(fast_phases, 'prep'),
+ dict_avg(slow_phases, 'net') - dict_avg(fast_phases, 'net'),
+ dict_avg(slow_phases, 'gro') - dict_avg(fast_phases, 'gro')),
+ end='')
+ print(' %4.1f %7.1f %6.1f %4.1f %5.1f' % (
+ dict_avg(slow_phases, 'softirq') - dict_avg(fast_phases, 'softirq'),
+ dict_avg(slow_phases, 'handoff') - dict_avg(fast_phases, 'handoff'),
+ dict_avg(slow_phases, 'queue') - dict_avg(fast_phases, 'queue'),
+ dict_avg(slow_phases, 'sendmsg') - dict_avg(fast_phases, 'sendmsg'),
+ dict_avg(slow_phases, 'prep2') - dict_avg(fast_phases, 'prep2')),
+ end='')
+ print(' %6.1f %6.1f %5.1f %6.1f %6.1f %4.1f' % (
+ dict_avg(slow_phases, 'net2') - dict_avg(fast_phases, 'net2'),
+ dict_avg(slow_phases, 'gro2') - dict_avg(fast_phases, 'gro2'),
+ dict_avg(slow_phases, 'softirq2') - dict_avg(fast_phases, 'softirq2'),
+ dict_avg(slow_phases, 'handoff2') - dict_avg(fast_phases, 'handoff2'),
+ dict_avg(slow_phases, 'queue2') - dict_avg(fast_phases, 'queue2'),
+ dict_avg(slow_phases, 'done') - dict_avg(fast_phases, 'done')))
+
+
+#------------------------------------------------
+# Analyzer: rx
+#------------------------------------------------
+class AnalyzeRx:
+ """
+ Generates one data file for each node showing various statistics
+ related to incoming message reception as a function of time, including
+ data rate, live messages, info about outstanding grants, and where
+ incoming data packets are currently located (qdisc, net, gro). Requires
+ the --data and --gbps options. 
+ """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeIntervals') + return + + def output(self): + global intervals, options + + print('\n------------') + print('Analyzer: rx') + print('------------') + if options.data == None: + print('--data option wasn\'t specified, so no output generated.') + return + print('See data files rx_*.dat in %s\n' % (options.data)) + print('Average receive throughput:') + + for node in get_sorted_nodes(): + f = open('%s/rx_%s.dat' % (options.data, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Statistics about messages received by node ') + f.write('%s over %d usec intervals:\n' % (node, options.interval)) + f.write('# Time: End of the time interval\n') + f.write('# Gbps: Rate of data received by GRO during the interval\n') + f.write('# Live: Messages for which at least one packet has ' + 'been transmitted\n') + f.write('# by the peer, but which have not been fully ' + 'received by SoftIRQ,\n') + f.write('# as of the end of the interval\n') + f.write('# Pkts: Packets received by GRO during the interval\n') + f.write('# Grantable: # of incoming RPCs that are not fully ' + 'granted\n') + f.write('# TxGrant: KB of new grants passed to ip*xmit during ' + 'the interval\n') + f.write('# Granted: KB of grants that have been sent, but for ' + 'which corresponding\n') + f.write('# data packets have not been transmitted by ' + 'the peer\n') + f.write('# IP: KB of data that have been passed to ip*xmit ' + 'on sender (or\n') + f.write(' requeued by homa_qdisc after being ' + 'deferred) but not yet\n') + f.write(' transmitted by NIC; large numbers probably ' + 'indicate qdisc backup\n') + f.write('# Net: KB of data that have been passed to the ' + 'NIC but not\n') + f.write('# yet received by GRO\n') + f.write('# Late: KB of data transmitted by NIC > %d us ago ' + '(%d is the value\n' % (options.late, options.late)) + f.write('# of the --late option) but not yet ' + 'seen by receiver\'s GRO\n') + f.write('# GRO: KB of data that have been received by ' + 'GRO but not yet\n') + f.write('# received by SoftIRQ\n') + + f.write('\n# Time Gbps Live Pkts Grantable TxGrant Granted' + ' IP Net Late GRO\n') + total = 0 + for interval in intervals[node]: + if not 'rx_bytes' in interval: + print('Strange interval for %s: %s' % (node, interval)) + gbps = interval['rx_bytes'] * 8 / (options.interval * 1000) + total += gbps + f.write('%8.1f %6.1f %5d %4d %4d %5.0f %5.0f ' + '%5.0f %5.0f %5.0f %5.0f\n' + % (interval['time'], gbps, + interval['rx_live'], + interval['rx_pkts'], + interval['rx_grantable'], + interval['rx_new_grants'] * 1e-3, + interval['rx_granted'] * 1e-3, + interval['rx_data_qdisc'] * 1e-3, + interval['rx_data_net'] * 1e-3, + interval['rx_overdue'] * 1e-3, + interval['rx_data_gro'] * 1e-3)) + f.close() + print('%-10s %6.1f Gbps' % (node, total/len(intervals[node]))) + +#------------------------------------------------ +# Analyzer: rxbufs +#------------------------------------------------ +class AnalyzeRxbufs: + """ + Analyzes lifetimes of skbs for incoming packets to compute total buffer + usage for each channel and underflows of NIC buffer caches (based on + caching mechanism of Mellanox mlx5 driver). 
+ """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + + def output(self): + global packets, rpcs + + # List of records, where type is + # "alloc" or "free", id is a packet id, core is the core where + # homa_gro_receive processed the packet (in the form "node.core"), + # and length is the number of bytes consumed by the packet. + events = [] + + # Core number (node.core) -> total number of bytes received so far + # by homa_gro_receive on that core. + core_bytes = defaultdict(lambda : 0) + + # Packet id -> tuple, where gro_time is the + # time when the packet was processed by homa_gro_receive and + # core_bytes is the value of core_bytes just before the packet + # was allocated. + pkt_allocs = {} + + # Core id -> , where active_bytes + # is the largest number of active skb bytes seen for that core, time + # is the time when some of those bytes were finally freed, pid is the + # id of the packet freed at time, and gro_time is the time when that + # packet was processed by homa_gro_receive. + core_max = defaultdict(lambda : [0, 0, '', 0]) + + # Scan all packets to build the events list. Note: change packet + # ids to refer to those on the receiver, not sender. + for pkt in packets.values(): + if not 'gro' in pkt: + continue + rpc_id = pkt['id'] ^ 1 + pkid = '%d:%d' % (rpc_id, pkt['offset']) + rpc = rpcs[rpc_id] + core = '%s.%d' % (rpc['node'], rpc['gro_core']) + events.append([pkt['gro'], 'alloc', pkid, core, pkt['length']]) + if 'free' in pkt: + events.append([pkt['free'], 'free', pkid, core, pkt['length']]) + + # Process the events in time order + events.sort(key=lambda t : t[0]) + for time, type, pkid, core, length in events: + if type == 'alloc': + pkt_allocs[pkid] = [time, core_bytes[core]] + core_bytes[core] += length + elif type == 'free': + if pkid in pkt_allocs: + active_bytes = core_bytes[core] - pkt_allocs[pkid][1] + if active_bytes > core_max[core][1]: + core_max[core] = [time, active_bytes, pkid, + pkt_allocs[pkid][0]] + else: + print('Bogus event type %s in nicbufs analzyer' % (type), + file=sys.stderr) + + + print('\n-----------------') + print('Analyzer: rxbufs') + print('-----------------') + print('Maximum active NIC buffer space used for each GRO core over the') + print('life of the traces (assuming Mellanox mlx5 buffer cache):') + print('Active: Maximum bytes of NIC buffers used by the core (bytes') + print(' allocated on Core between when PktId was received and') + print(' when PktId was freed)') + print('PktId: Identifier (as seen by receiver) for the packet ') + print(' corresponding to Active') + print('Node: Node where Pktid was received') + print('Core: Core on which Pktid was received') + print('GRO: Time when homa_gro_receive processed Pktid on Core') + print('Free: Time when packet was freed after copying to user space') + print('Life: Packet lifetime (Free - GRO, usecs)\n') + + maxes = [] + for core, max in core_max.items(): + time, active, pkid, gro_time = max + maxes.append([core, time, active, pkid, gro_time]) + maxes.sort(key=lambda t : t[2], reverse = True) + print(' Active PktId Node Core GRO ' + 'Free Life') + print('-------------------------------------------------------------' + '------------') + for core, time, active, pkid, gro_time in maxes: + node, core_id = core.split('.') + print('%8d %20s %10s %4s %9.3f %9.3f %7.1f' % (active, pkid, + node, core_id, gro_time, time, time - gro_time)) + +#------------------------------------------------ +# Analyzer: rxsnapshot 
+#------------------------------------------------
+class AnalyzeRxsnapshot:
+ """
+ Prints information about the state of incoming messages to a particular
+ node at a given time. Requires the --node and --time options.
+ """
+
+ def __init__(self, dispatcher):
+ global options
+ require_options('snapshot', 'time', 'node')
+ dispatcher.interest('AnalyzeRpcs')
+ dispatcher.interest('AnalyzePackets')
+
+ def collect_live_rpcs(node, t, receive):
+ """
+ Collects information about RPCs that are live at a given time and
+ returns a dictionary with information about each relevant RPC.
+
+ node: Only return RPCs that involve this node.
+ t: Time of interest
+ receive: True means only return RPCs with live messages being sent
+ to node at t; false means return RPCs with live messages
+ being sent from node at t.
+
+ The return value is a dictionary mapping RPC id -> dictionary, where
+ id is the id of the sending RPC and the dictionary contains the
+ following values:
+ pkts: List of all the data packets in this RPC
+ grants: List of all the grant packets in this RPC
+ unsched: Number of bytes of unscheduled incoming data,
+ or 0 if unknown
+ min_time: Lowest "interesting" time seen in any packet
+ for this RPC
+ lost: Number of packets that appear to have been lost
+ (transmitted but not received after long delay)
+
+ pre_xmit2: Offset just after highest byte sent in a data
+ packet with 'xmit2' < target time
+ post_xmit2: Lowest offset contained in a data packet with
+ 'xmit2' >= target time
+ pre_gro and post_gro:
+ Same, except measured with 'gro' instead of 'xmit2'
+ pre_softirq and post_softirq:
+ Same, except measured with 'softirq' instead of 'xmit2'
+ pre_copied and post_copied:
+ Same, except measured with 'copied' instead of 'xmit2'
+
+ The following offsets record things that happened either before
+ or after the target time.
+ pre_grant_xmit: Highest end offset seen in a grant with 'xmit'
+ < target time
+ post_grant_xmit: Lowest (starting) offset seen in a grant with
+ 'xmit' >= target time
+ pre_grant_gro and post_grant_gro:
+ Same, except measured with 'gro' instead of 'xmit'
+ pre_grant_softirq and post_grant_softirq:
+ Same, except measured with 'softirq' instead
+ of 'xmit'
+
+ The following offsets are derived from those above and used for
+ sorting the RPCs in "closest to completion" order.
+ sort_grant_xmit pre_grant_xmit (if nonzero) else sort_grant_gro
+ sort_grant_gro pre_grant_gro (if nonzero) else sort_grant_softirq
+ sort_grant_softirq pre_grant_softirq (if nonzero) else pre_xmit2
+ """
+ global packets, grants, rpcs, options, traces, max_unsched
+
+ live_rpcs = defaultdict(lambda : {'pkts': [], 'grants': [],
+ 'pre_xmit2': 0, 'post_xmit2': 1e20,
+ 'pre_gro': 0, 'post_gro': 1e20,
+ 'pre_softirq': 0, 'post_softirq': 1e20,
+ 'pre_copied': 0, 'post_copied': 1e20,
+ 'pre_grant_xmit': 0, 'post_grant_xmit': 1e20,
+ 'pre_grant_gro': 0, 'post_grant_gro': 1e20,
+ 'pre_grant_softirq': 0, 'post_grant_softirq': 1e20,
+ 'lost': 0, 'min_time': 1e20, 'unsched': max_unsched
+ })
+
+ def check_live(tx_id, node, t, receive):
+ """
+ If receive is True, returns whether the RPC given by tx_id is live
+ for receiving on node at t. Otherwise returns whether tx_id is live
+ for sending on node at t. In either case, tx_id is the RPC id on
+ the sender. 
+            """
+            if receive:
+                if not tx_id^1 in rpcs:
+                    return False
+                rx_rpc = rpcs[tx_id^1]
+                if rx_rpc['node'] != node:
+                    return False
+                if not 'rx_live' in rx_rpc:
+                    return False
+                start, end = rx_rpc['rx_live']
+            else:
+                if not tx_id in rpcs:
+                    return False
+                tx_rpc = rpcs[tx_id]
+                if tx_rpc['node'] != node:
+                    return False
+                if not 'tx_live' in tx_rpc:
+                    return False
+                start, end = tx_rpc['tx_live']
+            return (start <= t) and (end > t)
+
+        # Collect info from data packets
+        for pkt in packets.values():
+            id = pkt['id']
+            if not check_live(id, node, t, receive):
+                continue
+
+            live_rpc = live_rpcs[id]
+            live_rpc['pkts'].append(pkt)
+
+            offset = pkt['offset']
+            end_offset = offset + pkt['length']
+            for type in ['xmit2', 'gro', 'softirq', 'copied']:
+                if (type in pkt):
+                    pkt_time = pkt[type]
+                    if pkt_time < t:
+                        if end_offset > live_rpc['pre_' + type]:
+                            live_rpc['pre_' + type] = end_offset
+                    else:
+                        if offset < live_rpc['post_' + type]:
+                            live_rpc['post_' + type] = offset
+
+        # Collect info from grant packets
+        for pkt in grants.values():
+            id = pkt['id']^1
+            if not check_live(id, node, t, receive):
+                continue
+
+            live_rpc = live_rpcs[id]
+            live_rpc['grants'].append(pkt)
+
+            end_offset = pkt['offset']
+            offset = end_offset - pkt['increment']
+            for type in ['xmit', 'gro', 'softirq']:
+                if (type in pkt):
+                    pkt_time = pkt[type]
+                    if pkt_time < t:
+                        if end_offset > live_rpc['pre_grant_' + type]:
+                            live_rpc['pre_grant_' + type] = end_offset
+                    else:
+                        if offset < live_rpc['post_grant_' + type]:
+                            live_rpc['post_grant_' + type] = offset
+
+        # Collect info about RPCs for which no data or grant packets
+        # were transmitted.
+        for id, tx_rpc in rpcs.items():
+            if id in live_rpcs:
+                continue
+            if not check_live(id, node, t, receive):
+                continue
+            if 'sent' in tx_rpc:
+                live_rpcs[id]['pre_xmit2'] = tx_rpc['sent']
+            else:
+                live_rpcs[id]['pre_xmit2'] = 0
+
+        # Deduce missing fields (or improve estimates) in RPCs where possible
+        for id, live_rpc in live_rpcs.items():
+            next_stage = 0
+            if id^1 in rpcs:
+                rx_rpc = rpcs[id^1]
+            else:
+                rx_rpc = {}
+            if 'remaining' in rx_rpc:
+                rcvd = rx_rpc['in_length'] - rx_rpc['remaining']
+                if live_rpc['post_copied'] > 1e19:
+                    live_rpc['post_copied'] = rcvd
+            for type in ['copied', 'softirq', 'gro', 'xmit2']:
+                pre_field = 'pre_' + type
+                post_field = 'post_' + type
+                pre = live_rpc[pre_field]
+                post = live_rpc[post_field]
+
+                # We can correct for missing information by using packets
+                # after the target time, or packets from the next stage
+                # (e.g. if a byte got to SoftIRQ it must have been received
+                # by GRO).
+                if post < 1e20 and post > pre:
+                    pre = post
+                if next_stage > pre:
+                    pre = next_stage
+                live_rpc[pre_field] = pre
+                next_stage = pre
+
+            # Deduce missing grant fields where possible.
+            next_stage = 0
+            unsched = 0
+            if 'unsched' in rx_rpc:
+                unsched = rx_rpc['unsched']
+                live_rpc['unsched'] = unsched
+            if 'granted' in rx_rpc and live_rpc['post_grant_softirq'] >= 1e19:
+                live_rpc['post_grant_softirq'] = rx_rpc['granted']
+            if (unsched > 0 and live_rpc['pre_xmit2'] > unsched and
+                    live_rpc['pre_xmit2'] > live_rpc['pre_grant_softirq']):
+                # We sent data beyond the unscheduled region: those bytes
+                # must have been granted.
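+                # (Worked example of this inference, using made-up numbers:
+                # if unsched is 10000 but pre_xmit2 is 60000, then bytes
+                # 10000-59999 were scheduled, so a grant covering at least
+                # offset 60000 must have reached SoftIRQ before the target
+                # time.)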
+                live_rpc['pre_grant_softirq'] = live_rpc['pre_xmit2']
+            for type in ['softirq', 'gro', 'xmit']:
+                pre_field = 'pre_grant_' + type
+                post_field = 'post_grant_' + type
+                pre = live_rpc[pre_field]
+                post = live_rpc[post_field]
+                if id == 999999:
+                    print('Id %d before inference post_grant_%s %d, '
+                            'pre_grant_%s %d, next_stage %d' % (id, type,
+                            post, type, pre, next_stage))
+                if post < 1e20 and post > pre:
+                    pre = post
+                if next_stage > pre:
+                    pre = next_stage
+                live_rpc[pre_field] = pre
+                next_stage = pre
+                if id == 999999:
+                    print('Id %d after inference post_grant_%s %d, '
+                            'pre_grant_%s %d, next_stage %d' % (id, type,
+                            live_rpc[post_field], type, pre, next_stage))
+
+            # Fields for sorting.
+            if live_rpc['pre_grant_softirq']:
+                live_rpc['sort_grant_softirq'] = live_rpc['pre_grant_softirq']
+            else:
+                live_rpc['sort_grant_softirq'] = live_rpc['pre_xmit2']
+            if live_rpc['pre_grant_gro']:
+                live_rpc['sort_grant_gro'] = live_rpc['pre_grant_gro']
+            else:
+                live_rpc['sort_grant_gro'] = live_rpc['sort_grant_softirq']
+            if live_rpc['pre_grant_xmit']:
+                live_rpc['sort_grant_xmit'] = live_rpc['pre_grant_xmit']
+            else:
+                live_rpc['sort_grant_xmit'] = live_rpc['sort_grant_gro']
+
+            # Count lost packets in the RPC.
+            for pkt in live_rpc['pkts']:
+                if (('xmit2' in pkt) and (not 'gro' in pkt)
+                        and (pkt['xmit2'] >= traces[node]['first_time'])
+                        and ((options.time - pkt['xmit2']) > 200)
+                        and (options.time < traces[node]['last_time'])):
+                    live_rpc['lost'] += 1
+                    print('Lost packet: %s' % (pkt))
+        return live_rpcs
+
+    def get_sorted_ids(live_rpcs):
+        """
+        Given the results from collect_live_rpcs, return a list of the
+        ids in live_rpcs, sorted based on how nearly complete the
+        messages are (most nearly complete first).
+        """
+
+        def sort_key(live_rpcs, id, field):
+            if id in rpcs:
+                length = rpcs[id]['out_length']
+                if length == None:
+                    length = 0
+            else:
+                length = 0
+            if not field in live_rpcs[id]:
+                print('Missing field %s in id %d: %s' % (field, id, live_rpcs[id]))
+            return length - live_rpcs[id][field]
+        sorted_ids = sorted(live_rpcs.keys(),
+                key = lambda id : live_rpcs[id]['pre_copied'],
+                reverse = True)
+        sorted_ids = sorted(sorted_ids,
+                key = lambda id : sort_key(live_rpcs, id, 'pre_copied'))
+        sorted_ids = sorted(sorted_ids,
+                key = lambda id : sort_key(live_rpcs, id, 'pre_softirq'))
+        sorted_ids = sorted(sorted_ids,
+                key = lambda id : sort_key(live_rpcs, id, 'pre_gro'))
+        sorted_ids = sorted(sorted_ids,
+                key = lambda id : sort_key(live_rpcs, id, 'pre_xmit2'))
+        sorted_ids = sorted(sorted_ids,
+                key = lambda id : sort_key(live_rpcs, id, 'sort_grant_softirq'))
+        sorted_ids = sorted(sorted_ids,
+                key = lambda id : sort_key(live_rpcs, id, 'sort_grant_gro'))
+        sorted_ids = sorted(sorted_ids,
+                key = lambda id : sort_key(live_rpcs, id, 'sort_grant_xmit'))
+
+        # Separate out messages for which some packets have been received
+        # by GRO from those that have no received packets.
+        got_gro = []
+        no_gro = []
+        for id in sorted_ids:
+            if live_rpcs[id]['pre_gro'] > 0:
+                got_gro.append(id)
+            else:
+                no_gro.append(id)
+        sorted_ids = got_gro + no_gro
+
+        return sorted_ids
+
+    def count_data(self, rpc, start_time, end_time):
+        """
+        Return a count of the number of message bytes present in all
+        data packets received for @rpc between @start_time and @end_time.
+ """ + + result = 0 + for pkt in rpc['softirq_data_pkts']: + softirq = pkt['softirq'] + if (softirq >= start_time) and (softirq < end_time): + result += pkt['length'] + return result + + def output(self): + global packets, rpcs, options, traces + + live_rpcs = AnalyzeRxsnapshot.collect_live_rpcs(options.node, + options.time, True) + sorted_ids = AnalyzeRxsnapshot.get_sorted_ids(live_rpcs) + + print('\n--------------------') + print('Analyzer: rxsnapshot') + print('--------------------') + print('A snapshot of incoming messages to %s at time %.1f' % ( + options.node, options.time)) + + print('\n%d RPCs have live incoming messages:' % + (len(live_rpcs))) + print('Id: RPC identifier on the receiver side') + print('Peer: Sending node') + print('Length: Length of incoming message, if known') + print('Gxmit: Highest offset for which grant has been passed ' + 'to ip_*xmit') + print('RxRem: Bytes in message that haven\'t yet been received ' + '(Length - Gro);') + print(' smaller means higher SRPT priority for grants') + print('GGro: Highest offset in grant that has been received by GRO') + print('GSoft: Highest offset in grant that has been processed ' + 'by SoftIRQ') + print('Xmit: Offset just after last byte that has been ' + 'passed to ip*xmit') + print(' or requeued by homa_qdisc after deferral') + print('Gro: Offset just after last data byte that has been ' + 'processed by GRO') + print('SoftIrq: Offset just after last data byte that has been ' + 'processed by SoftIRQ') + print('Copied: Offset just after last data byte that has been ' + 'copied to user space') + print('Incoming: Gxmit - SoftIrq') + print('Lost: Packets that appear to have been dropped in the network') + print(' Id Peer Length RxRem GXmit GGro GSoft ', + end='') + print(' Xmit Gro SoftIrq Copied Incoming Lost') + print('--------------------------------------------------------------', end='') + print('---------------------------------------------') + + for id in sorted_ids: + rx_rpc = rpcs[id^1] + live_rpc = live_rpcs[id] + post_data = self.count_data(rx_rpc, options.time, + rx_rpc['stats_time'] if 'stats_time' in rx_rpc else 1e20) + if 'remaining' in rx_rpc: + received = rx_rpc['in_length'] - rx_rpc['remaining'] - post_data + else: + received = rx_rpc['in_length'] - post_data + incoming = live_rpc['pre_grant_xmit'] - received + if incoming <= 0: + incoming = '' + if rx_rpc['in_length']: + rx_rem = rx_rpc['in_length'] - live_rpc['pre_gro'] + else: + rx_rem = "" + print('%10d %-10s %7s %7s %7s %7s %7s ' % (id^1, + rpcs[id]['node'] if id in rpcs else "", + rx_rpc['in_length'] if rx_rpc['in_length'] != None else "", + rx_rem, + str(live_rpc['pre_grant_xmit']) + if live_rpc['pre_grant_xmit'] > live_rpc['unsched'] else "", + str(live_rpc['pre_grant_gro']) + if live_rpc['pre_grant_gro'] > live_rpc['unsched'] else "", + str(live_rpc['pre_grant_softirq']) + if live_rpc['pre_grant_softirq'] > live_rpc['unsched'] + else ""), end='') + print('%7d %7d %7d %7d %7s %4d' % (live_rpc['pre_xmit2'], + live_rpc['pre_gro'], live_rpc['pre_softirq'], + live_rpc['pre_copied'], incoming, live_rpc['lost'])) + + print('\nFields in the tables below:') + print('Id: Packet\'s RPC identifier on the receiver side') + print('Offset: Starting offset of packet data within its message') + print('TxCore: Core where sender passed packet to ip*xmit') + print('GCore: Core where receiver GRO processed packet') + print('SCore: Core where receiver SoftIRQ processed packet') + print('Xmit: Time when sender passed packet to ip*xmit or when ' + 'sender qdisc') + print(' 
requeued packet after deferral, whichever is later') + print('Nic: Time when sender handed off packet to NIC') + print('Free: Time when packet buffer freed after tx') + print('Gro: Time when receiver GRO processed packet') + print('SoftIrq: Time when receiver SoftIRQ processed packet') + print('Numbers in parentheses give the difference between the ' + 'preceding value') + print('and the reference time') + + # Generate a line with overall info about the state of incoming + # data for an RPC. + trace_start = traces[options.node]['first_time'] + for tx_id in sorted_ids: + live_rpc = live_rpcs[tx_id] + rx_rpc = rpcs[tx_id^1] + info = '' + prefix = ' (' + if rx_rpc['in_length'] != None: + info += '%s%d bytes' % (prefix, rx_rpc['in_length']) + prefix = ', ' + received = get_received(rx_rpc, options.time) + if received != None: + info += '%sreceived %d' % (prefix, received) + prefix = ', ' + got_gro = get_granted(rx_rpc, options.time) + if got_gro != None: + info += '%sgranted %d' % (prefix, got_gro) + prefix = ', ' + if 'peer' in rx_rpc: + info += '%speer %s' % (prefix, ip_to_node[rx_rpc['peer']]) + if info: + info += ')' + + live_rpc['pkts'].sort(key = lambda d : d['offset']) + net_pkts = [] + gro_pkts = [] + for pkt in live_rpc['pkts']: + offset = pkt['offset'] + keep = True + if 'xmit2' in pkt: + xmit2 = pkt['xmit2'] + if xmit2 >= options.time: + keep = False + if ((xmit2 < trace_start) and (not 'gro' in pkt) and + (not 'copied' in pkt)): + keep = False + elif offset >= live_rpc['pre_xmit2']: + keep = False + if 'gro' in pkt: + if pkt['gro'] < options.time: + keep = False + elif offset < live_rpc['pre_gro']: + keep = False + if keep: + net_pkts.append(pkt) + + keep = True + if 'gro' in pkt: + if pkt['gro'] >= options.time: + keep = False + elif offset >= live_rpc['pre_gro']: + keep = False + if 'softirq' in pkt: + if pkt['softirq'] < options.time: + keep = False + elif offset < live_rpc['pre_softirq']: + keep = False + if keep: + gro_pkts.append(pkt) + + live_rpc['grants'].sort(key = lambda d : d['offset']) + net_grants = [] + gro_grants = [] + for pkt in live_rpc['grants']: + offset = pkt['offset'] + keep = True + if 'xmit' in pkt: + if pkt['xmit'] > options.time: + keep = False + elif offset > live_rpc['pre_xmit2']: + keep = False + if 'gro' in pkt: + if pkt['gro'] <= options.time: + keep = False + elif offset <= live_rpc['pre_gro']: + keep = False + if keep: + net_grants.append(pkt) + + keep = True + if 'gro' in pkt: + if pkt['gro'] > options.time: + keep = False + elif offset > live_rpc['pre_gro']: + keep = False + if 'softirq' in pkt: + if pkt['softirq'] <= options.time: + keep = False + elif offset <= live_rpc['pre_softirq']: + keep = False + if keep: + gro_grants.append(pkt) + + if (not net_pkts) and (not gro_pkts) and (not net_grants) and ( + not gro_grants): + continue + print('\nRPC id %d%s:' % (tx_id^1, info)) + + if net_pkts: + print('Incoming data packets that have been transmitted but ' + 'not received by GRO:') + print('Offset Xmit TxCore Nic ' + 'Gro Free GCore') + for pkt in net_pkts: + print('%6d %7s %-10s %4s %7s %8s %7s %8s %7s %8s %5s' % ( + pkt['offset'], print_field_if(pkt, 'xmit2', '%7.1f'), + print_field_if(pkt, 'xmit2', '(%7.1f)', + lambda t : t - options.time ), + print_field_if(pkt, 'tx_core', '%4d'), + print_field_if(pkt, 'nic', '%7.1f'), + print_field_if(pkt, 'nic', '(%6.1f)', + lambda t : t - options.time), + print_field_if(pkt, 'gro', '%7.1f'), + print_field_if(pkt, 'gro', '(%6.1f)', + lambda t : t - options.time), + print_field_if(pkt, 'free_tx_skb', 
'%7.1f'), + print_field_if(pkt, 'free_tx_skb', '(%6.1f)', + lambda t : t - options.time), + print_field_if(pkt, 'gro_core', '%3d'))) + + if gro_pkts: + print('Incoming data packets that have been seen by GRO but ' + 'not yet by SoftIRQ:') + print('Offset Gro GCore SoftIRQ SCore') + for pkt in gro_pkts: + print('%6d %7s %9s %5s %7s %8s %7s' % ( + pkt['offset'], print_field_if(pkt, 'gro', '%7.1f'), + print_field_if(pkt, 'gro', '(%7.1f)', + lambda t : t - options.time), + print_field_if(pkt, 'gro_core', '%3d'), + print_field_if(pkt, 'softirq', '%7.1f'), + print_field_if(pkt, 'softirq', '(%6.1f)', + lambda t : t - options.time), + print_field_if(pkt, 'softirq_core', '%3d'))) + + if net_grants: + print('Outgoing grants that have been passed to ip*xmit but ' + 'not received by GRO:') + print('Offset Xmit TxCore Gro GCore') + for pkt in net_grants: + print('%6d %7s %-10s %4s %7s %8s %5s' % ( + pkt['offset'], print_field_if(pkt, 'xmit', '%7.1f'), + print_field_if(pkt, 'xmit', '(%7.1f)', + lambda t : t - options.time ), + print_field_if(pkt, 'tx_core', '%4d'), + print_field_if(pkt, 'gro', '%7.1f'), + print_field_if(pkt, 'gro', '(%6.1f)', + lambda t : t - options.time), + print_field_if(pkt, 'gro_core', '%3d'))) + if gro_grants: + print('Outgoing grants that have been seen by GRO but not ' + 'yet by SoftIRQ:') + print('Offset Gro GCore SoftIRQ SCore') + for pkt in gro_grants: + print('%6d %7s %9s %5s %7s %8s %7s' % ( + pkt['offset'], print_field_if(pkt, 'gro', '%7.1f'), + print_field_if(pkt, 'gro', '(%7.1f)', + lambda t : t - options.time), + print_field_if(pkt, 'gro_core', '%3d'), + print_field_if(pkt, 'softirq', '%7.1f'), + print_field_if(pkt, 'softirq', '(%6.1f)', + lambda t : t - options.time), + print_field_if(pkt, 'softirq_core', '%3d'))) + +#------------------------------------------------ +# Analyzer: smis +#------------------------------------------------ +class AnalyzeSmis: + """ + Prints out information about SMIs (System Management Interrupts) that + occurred during the traces. An SMI causes all of the cores on a node + to freeze for a significant amount of time. + """ + def __init__(self, dispatcher): + # A list of tuples, each of which describes one + # gap that looks like an SMI. + self.smis = [] + + # Time of the last trace record seen. + self.last_time = None + return + + def tt_all(self, trace, t, core, msg): + if self.last_time == None: + self.last_time = t + return + if (t - self.last_time) > 50: + self.smis.append([self.last_time, t, trace['node']]) + self.last_time = t + + def output(self): + print('\n-------------------') + print('Analyzer: smis') + print('-------------------') + print('Gaps that appear to be caused by System Management ' + 'Interrupts (SMIs),') + print('which freeze all cores on a node simultaneously:') + print('') + print(' Start End Gap Node') + print('-----------------------------------') + for smi in sorted(self.smis, key=lambda t : t[0]): + start, end, node = smi + print('%9.3f %9.3f %6.1f %s' % (start, end, end - start, node)) + +#------------------------------------------------ +# Analyzer: tcp_rpcs +#------------------------------------------------ +class AnalyzeTcp_rpcs: + """ + Print information about RPCs that were transmitted with TCP. The options + --msglen, --rpc-start, and --rtt may be used to filter the RPCs to print. + By default the RPCs are printed in order of start time, but that may be + changed with the --sort option. 
The --sort option is a list of the
+    column names Start, End, and Rtt; the RPCs will be sorted by each keyword
+    in order before printing. If --verbose is specified then the packets from
+    the selected RPCs are also printed.
+    """
+
+    def __init__(self, dispatcher):
+        dispatcher.interest('AnalyzeTcppackets')
+
+        # "source dest" -> list of entries in tcp_rpcs whose client and
+        # server fields match source and dest.
+        self.rpcs = defaultdict(list)
+
+        # "source dest" -> list of <time, sequence, length> tuples for
+        # all of the recvmsg completions for this stream. Time is the time
+        # when recvmsg completed, sequence is the sequence number
+        # just after the last one returned, and length is the number of
+        # bytes returned.
+        self.recvs = defaultdict(list)
+
+    def tt_tcp_sendmsg(self, trace, t, core, source, dest, msg_length,
+            sequence, slot, response):
+        global tcp_rpcs
+
+        # Create a new entry in tcp_rpcs for this message. At this point
+        # we don't have enough information to pair request and response
+        # messages, so create separate "rpcs" for each, pretending all
+        # messages are requests. The analyze method will combine requests
+        # and responses into a single entry.
+        rpc = {
+            'client': source,
+            'server': dest,
+            'req_send': t,
+            'req_length': msg_length,
+            'req_seq': sequence,
+            'req_end_seq': sequence + msg_length,
+            'slot': slot,
+            'req_pkts': [],
+            'resp_pkts': []
+        }
+        if response:
+            rpc['response'] = 1
+        tcp_rpcs[f'{source} {dest} {sequence}'] = rpc
+        self.rpcs[f'{source} {dest}'].append(rpc)
+
+    def tt_tcp_recvmsg(self, trace, t, core, source, dest, msg_length,
+            sequence):
+        self.recvs[f'{source} {dest}'].append([t, sequence, msg_length])
+
+    def del_rpc(self, rpc):
+        """
+        Remove an entry from tcp_rpcs.
+        rpc:   Entry to remove (it's incomplete: describes either a
+               request or a response but not both)
+        """
+        global tcp_rpcs
+
+        del tcp_rpcs[f"{rpc['client']} {rpc['server']} {rpc['req_seq']}"]
+
+    def merge(self, request, response):
+        """
+        Move information from an RPC that contains only a response to
+        an RPC that currently contains only a request.
+        request:   Entry in tcp_rpcs that describes a request message
+        response:  Entry in tcp_rpcs that describes the response
+                   corresponding to request. This will be deleted.
+        """
+        request['resp_send'] = response['req_send']
+        request['resp_seq'] = response['req_seq']
+        request['resp_length'] = response['req_length']
+        request['resp_end_seq'] = response['req_end_seq']
+        request['resp_pkts'] = response['req_pkts']
+        if 'req_recvd' in response:
+            request['resp_recvd'] = response['req_recvd']
+        self.del_rpc(response)
+
+    def analyze(self):
+        """
+        Finish the creation of tcp_rpcs
+        """
+        global tcp_packets, tcp_rpcs
+
+        # "source dest" -> list of data packets (nonzero length) from
+        # source to dest.
+        stream_pkts = defaultdict(list)
+
+        # Bucket TCP packets in the same way as self.rpcs.
+        for pkt in tcp_packets.values():
+            if pkt['length'] == 0:
+                continue
+            key = '%s %s' % (pkt['source'], pkt['dest'])
+            stream_pkts[key].append(pkt)
+
+        # Assign TCP packets to messages. Each iteration through this
+        # loop processes a source-dest pair, working through messages
+        # and packets in sequence order. A single packet could contain
+        # parts of multiple messages.
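+        # (Illustrative example with made-up numbers: a message occupying
+        # sequence range [1000, 5000) claims a packet spanning [4000, 6000)
+        # because the ranges overlap; since the packet extends past the end
+        # of the message, pkt_ix is not advanced and the same packet is
+        # offered to the next message as well, which starts at 5000.)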
+        for key, rpcs in self.rpcs.items():
+            rpcs.sort(key = lambda rpc: rpc['req_seq'])
+            pkts = sorted(stream_pkts[key],
+                    key = lambda pkt: pkt['seq_ack'])
+            pkt_ix = 0
+
+            for rpc in rpcs:
+                rpc_start = rpc['req_seq']
+                rpc_end = rpc['req_end_seq']
+                while pkt_ix < len(pkts):
+                    pkt = pkts[pkt_ix]
+                    pkt_start = pkt['seq_ack']
+                    pkt_end = pkt_start + pkt['length']
+                    if pkt_end <= rpc_start:
+                        pkt_ix += 1
+                        continue
+                    if pkt_start >= rpc_end:
+                        break
+                    rpc['req_pkts'].append(pkt)
+                    if pkt_end <= rpc_end:
+                        pkt_ix += 1
+                    else:
+                        break
+
+        # Add req_recvd times to RPCs.
+        for key, rpcs in self.rpcs.items():
+            rpcs.sort(key = lambda rpc: rpc['req_end_seq'])
+            recvs = sorted(self.recvs[key], key = lambda t: t[1],
+                    reverse = True)
+
+            for rpc in rpcs:
+                while recvs:
+                    t, sequence, length = recvs[-1]
+                    rpc_end = rpc['req_end_seq']
+                    if sequence < rpc_end:
+                        recvs.pop()
+                        continue
+                    if sequence - length < rpc_end:
+                        rpc['req_recvd'] = t
+                    break
+
+        # Find matching pairs of request and response messages and merge
+        # them into single (and complete) RPCs.
+        for key, rpcs in list(self.rpcs.items()):
+            # slot -> List of requests in the forward direction and responses
+            # in the reverse direction for this slot and client-server pair,
+            # sorted by sendmsg time.
+            slot_rpcs = defaultdict(list)
+
+            for rpc in rpcs:
+                if 'response' in rpc:
+                    continue
+                slot_rpcs[rpc['slot']].append(rpc)
+            source, dest = key.split()
+            for rpc in self.rpcs[f'{dest} {source}']:
+                if not 'response' in rpc:
+                    continue
+                slot_rpcs[rpc['slot']].append(rpc)
+            for rpcs in slot_rpcs.values():
+                rpcs.sort(key = lambda msg: msg['req_send'])
+                request = None
+                for rpc in rpcs:
+                    if 'response' in rpc:
+                        if (request == None or
+                                rpc['req_send'] <= request['req_send']):
+                            # Unmatchable response
+                            self.del_rpc(rpc)
+                            continue
+                        self.merge(request, rpc)
+                        request = None
+                    else:
+                        if request != None:
+                            # Unmatchable request
+                            self.del_rpc(request)
+                        request = rpc
+                if request != None:
+                    # Unmatchable trailing request
+                    self.del_rpc(request)
+
+    def output(self):
+        global tcp_rpcs, options
+
+        print('\n------------------')
+        print('Analyzer: tcp_rpcs')
+        print('------------------')
+
+        if (options.msglen != None or options.rpc_start != None or
+                options.rtt != None):
+            print_rpcs = filter_tcp_rpcs(tcp_rpcs.values(),
+                    msglen=options.msglen,
+                    rpc_start=options.rpc_start,
+                    rtt=options.rtt)
+            print('%d TCP RPCs were selected using the following filters:' %
+                    (len(print_rpcs)))
+            if options.msglen:
+                print('    --msglen %s' % (options.msglen))
+            if options.rpc_start:
+                print('    --rpc-start %s' % (options.rpc_start))
+            if options.rtt:
+                print('    --rtt %s' % (options.rtt))
+        else:
+            print_rpcs = tcp_rpcs.values()
+            print('There are %d TCP RPCs in the traces' % (len(print_rpcs)))
+
+        sort_keys = options.sort
+        if sort_keys == None:
+            sort_keys = 'Start'
+        for key in sort_keys.split():
+            if key == 'Start':
+                print_rpcs = sorted(print_rpcs, key = lambda rpc:
+                        rpc['req_send'] if 'req_send' in rpc else 1e20)
+            elif key == 'End':
+                print_rpcs = sorted(print_rpcs, key = lambda rpc:
+                        rpc['resp_recvd'] if 'resp_recvd' in rpc else 1e20)
+            elif key == 'Rtt':
+                print_rpcs = sorted(print_rpcs, reverse = True, key = lambda rpc:
+                        rpc['resp_recvd'] - rpc['req_send']
+                        if 'resp_recvd' in rpc and 'req_send' in rpc else 0)
+            else:
+                raise Exception('Unknown sort key \'%s\' for tcp_rpcs '
+                        'analyzer' % (key))
+
+        # Collect and print overall statistics about the RPCs.
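+        # Each list below accumulates one kind of delay, in usecs, across
+        # all of the selected RPCs; the meaning of each list is spelled
+        # out in the column descriptions printed below.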
+        xmit = []
+        net = []
+        free = []
+        softirq = []
+        recv = []
+        srvc = []
+        rtt = []
+        for rpc in print_rpcs:
+            if rpc['req_pkts']:
+                first_req_pkt = rpc['req_pkts'][0]
+                last_req_pkt = rpc['req_pkts'][-1]
+            else:
+                first_req_pkt = []
+                last_req_pkt = []
+            if rpc['resp_pkts']:
+                first_resp_pkt = rpc['resp_pkts'][0]
+                last_resp_pkt = rpc['resp_pkts'][-1]
+            else:
+                first_resp_pkt = []
+                last_resp_pkt = []
+            if 'nic' in first_req_pkt:
+                xmit.append(first_req_pkt['nic'] - rpc['req_send'])
+            if 'nic' in first_resp_pkt:
+                xmit.append(first_resp_pkt['nic'] - rpc['resp_send'])
+            for pkt in itertools.chain(rpc['req_pkts'], rpc['resp_pkts']):
+                if 'gro' in pkt and 'nic' in pkt:
+                    net.append(pkt['gro'] - pkt['nic'])
+                if 'free_tx_skb' in pkt and 'nic' in pkt:
+                    free.append(pkt['free_tx_skb'] - pkt['nic'])
+                if 'softirq' in pkt and 'gro' in pkt:
+                    softirq.append(pkt['softirq'] - pkt['gro'])
+            if 'softirq' in last_req_pkt and 'req_recvd' in rpc:
+                recv.append(rpc['req_recvd'] - last_req_pkt['softirq'])
+            if 'softirq' in last_resp_pkt and 'resp_recvd' in rpc:
+                recv.append(rpc['resp_recvd'] - last_resp_pkt['softirq'])
+            if 'req_recvd' in rpc and 'resp_send' in rpc:
+                srvc.append(rpc['resp_send'] - rpc['req_recvd'])
+            if 'req_send' in rpc and 'resp_recvd' in rpc:
+                rtt.append(rpc['resp_recvd'] - rpc['req_send'])
+        for l in [xmit, net, free, softirq, recv, srvc, rtt]:
+            l.sort()
+
+        print('\nOverall statistics about the selected RPCs. Most of these '
+                'statistics')
+        print('combine data from request messages and response messages.')
+        print('Xmit:     Time from sendmsg until driver queued first '
+                'packet for NIC')
+        print('Net:      Time from NIC handoff to GRO receipt for packets')
+        print('Free:     Time from when NIC received packet until packet '
+                'was returned')
+        print('          to Linux and freed')
+        print('SoftIrq:  Time from when packet was received by GRO until it '
+                'was received')
+        print('          by SoftIRQ')
+        print('Recv:     Time from SoftIRQ for last packet in a message '
+                'until recvmsg completes')
+        print('Srvc:     Time from recvmsg return on server until '
+                'sendmsg for response')
+        print('Rtt:      Total time from request sendmsg until recvmsg '
+                'completes for response\n')
+
+        print('             Min      P10      P50      P90      P99      Max')
+        pctls = [0, 100, 500, 900, 990, 1000]
+        print('Xmit    %8s %8s %8s %8s %8s %8s' % tuple(
+                print_pctl(xmit, p, '%.1f') for p in pctls))
+        print('Net     %8s %8s %8s %8s %8s %8s' % tuple(
+                print_pctl(net, p, '%.1f') for p in pctls))
+        print('Free    %8s %8s %8s %8s %8s %8s' % tuple(
+                print_pctl(free, p, '%.1f') for p in pctls))
+        print('SoftIrq %8s %8s %8s %8s %8s %8s' % tuple(
+                print_pctl(softirq, p, '%.1f') for p in pctls))
+        print('Recv    %8s %8s %8s %8s %8s %8s' % tuple(
+                print_pctl(recv, p, '%.1f') for p in pctls))
+        print('Srvc    %8s %8s %8s %8s %8s %8s' % tuple(
+                print_pctl(srvc, p, '%.1f') for p in pctls))
+        print('Rtt     %8s %8s %8s %8s %8s %8s' % tuple(
+                print_pctl(rtt, p, '%.1f') for p in pctls))
+
+        # Print a summary line for each RPC.
+ print('\nSummary information for each selected RPC:') + print(print_tcp_rpcs(print_rpcs, header = True), end='') + + if options.verbose: + first = True + print('\nPackets from the selected RPCs (in the same RPC order as ' + 'above):') + for rpc in print_rpcs: + if not first: + print() + print(print_pkts(rpc['req_pkts'], header=first), end='') + print(print_pkts(rpc['resp_pkts'], header=False), end='') + first = False + +#------------------------------------------------ +# Analyzer: tcpdelay +#------------------------------------------------ +class AnalyzeTcpdelay: + """ + Prints information about various delays in the transmission of + TCP packets. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeTcppackets') + + def get_pkt_delays(self, pkt, delays): + """ + Extract delays from a TCP packet, add to lists in delays. + """ + + # Note: negative delays below are probably caused by retransmits; + # ignore them. + if 'xmit' in pkt: + if ('nic' in pkt): + delay = pkt['nic'] - pkt['xmit'] + if (delay >= 0): + delays['nic'].append(delay) + if 'qdisc_xmit' in pkt: + delay = pkt['qdisc_xmit'] - pkt['xmit'] + if (delay >= 0): + delays['qdisc'].append(delay) + if 'nic' in pkt: + if 'gro' in pkt: + delay = pkt['gro'] - pkt['nic'] + if (delay >= 0): + delays['gro'].append(delay) + if 'free_tx_skb' in pkt: + delay = pkt['free_tx_skb'] - pkt['nic'] + if (delay >= 0): + delays['free'].append(delay) + if 'free_tx_skb' in pkt and 'gro' in pkt: + delay = pkt['gro'] - pkt['free_tx_skb'] + if (delay >= 0): + delays['net'].append(delay) + if 'xmit' in pkt and 'gro' in pkt: + delay = pkt['gro'] - pkt['xmit'] + if (delay >= 0): + delays['total'].append(delay) + + def output(self): + global tcp_packets + + # Each of the following dictionaries holds lists of delays + # experienced by TCP packets; each variable covers a different + # range of packet lengths. 
The dictionary keys are:
+        # nic:    Delay from 'xmit' to 'nic'
+        # qdisc:  Delay from 'xmit' to 'qdisc_xmit', recorded only when
+        #         'qdisc_xmit' is present
+        # gro:    Delay from 'nic' to 'gro'
+        # free:   Delay from 'nic' to 'free_tx_skb'
+        # net:    Delay from 'free_tx_skb' to 'gro'
+        # total:  Delay from 'xmit' to 'gro'
+        short = defaultdict(list)
+        medium = defaultdict(list)
+        long = defaultdict(list)
+        ack = defaultdict(list)
+
+        short_limit = 1500
+        medium_limit = 10000
+        for pkt in tcp_packets.values():
+            if not 'tso_length' in pkt:
+                continue
+            tso_length = pkt['tso_length']
+            if tso_length == 0:
+                self.get_pkt_delays(pkt, ack)
+            elif tso_length <= short_limit:
+                self.get_pkt_delays(pkt, short)
+            elif tso_length <= medium_limit:
+                self.get_pkt_delays(pkt, medium)
+            else:
+                self.get_pkt_delays(pkt, long)
+
+        print('\n------------------')
+        print('Analyzer: tcpdelay')
+        print('------------------')
+        print('Delays in the transmission of TCP packets (all times in usecs):')
+        print('Xmit:   Time from ip*xmit call until driver queued packet for NIC')
+        print('Qdisc:  Time from ip*xmit call until homa_qdisc released '
+                'packet for')
+        print('        transmission (deferred packets only)')
+        print('Gro:    Time from when NIC received packet until GRO started '
+                'processing')
+        print('Free:   Time from when NIC received packet until packet was '
+                'returned to Linux')
+        print('        and freed (large values caused by queuing in NIC)')
+        print('Net:    Slight underestimate of time from when NIC '
+                'transmitted packet')
+        print('        until GRO processing started (Gro - Free)')
+        print('Total:  Time from ip*xmit call until GRO started processing')
+
+        def print_pcts(delays):
+            if not delays:
+                return '     0'
+            delays.sort()
+            return '%6d %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' % (
+                    len(delays), delays[0], delays[10*len(delays)//100],
+                    delays[50*len(delays)//100], delays[90*len(delays)//100],
+                    delays[99*len(delays)//100], delays[len(delays)-1],
+                    sum(delays)/len(delays))
+        print('\nPhase  Count    Min    P10    P50    P90    P99    Max    Avg')
+        print('----------------------------------------------------------------')
+
+        print('Data packets <= %d bytes:' % (short_limit))
+        print('Xmit  %s' % print_pcts(short['nic']))
+        print('Qdisc %s' % print_pcts(short['qdisc']))
+        print('Gro   %s' % print_pcts(short['gro']))
+        print('Free  %s' % print_pcts(short['free']))
+        print('Net   %s' % print_pcts(short['net']))
+        print('Total %s' % print_pcts(short['total']))
+
+        print('\nData packets %d-%d bytes:' % (short_limit + 1, medium_limit))
+        print('Xmit  %s' % print_pcts(medium['nic']))
+        print('Qdisc %s' % print_pcts(medium['qdisc']))
+        print('Gro   %s' % print_pcts(medium['gro']))
+        print('Free  %s' % print_pcts(medium['free']))
+        print('Net   %s' % print_pcts(medium['net']))
+        print('Total %s' % print_pcts(medium['total']))
+
+        print('\nData packets > %d bytes:' % (medium_limit))
+        print('Xmit  %s' % print_pcts(long['nic']))
+        print('Qdisc %s' % print_pcts(long['qdisc']))
+        print('Gro   %s' % print_pcts(long['gro']))
+        print('Free  %s' % print_pcts(long['free']))
+        print('Net   %s' % print_pcts(long['net']))
+        print('Total %s' % print_pcts(long['total']))
+
+        print('\nAcks:')
+        print('Xmit  %s' % print_pcts(ack['nic']))
+        print('Qdisc %s' % print_pcts(ack['qdisc']))
+        print('Gro   %s' % print_pcts(ack['gro']))
+        print('Free  %s' % print_pcts(ack['free']))
+        print('Net   %s' % print_pcts(ack['net']))
+        print('Total %s' % print_pcts(ack['total']))
+
+        # Print stats for P98-99 small packets
+        p99 = defaultdict(list)
+        delays = sorted(short['total'])
+        if delays:
+            min_delay = delays[98*len(delays)//100]
+            max_delay = delays[99*len(delays)//100]
+            for pkt in tcp_packets.values():
+                if not 'tso_length' in pkt or not 'xmit' in pkt or not 'gro' in pkt:
+                    continue
+                if pkt['tso_length'] > short_limit:
+                    continue
+                delay = pkt['gro'] - pkt['xmit']
+                if delay >= min_delay and delay <= max_delay:
+                    self.get_pkt_delays(pkt, p99)
+
+        print('\nP98-P99 packets <= %d bytes:' % (short_limit))
+        print('Xmit  %s' % print_pcts(p99['nic']))
+        print('Qdisc %s' % print_pcts(p99['qdisc']))
+        print('Gro   %s' % print_pcts(p99['gro']))
+        print('Free  %s' % print_pcts(p99['free']))
+        print('Net   %s' % print_pcts(p99['net']))
+        print('Total %s' % print_pcts(p99['total']))
+
+        # Print stats for P98-99 medium packets
+        p99 = defaultdict(list)
+        delays = sorted(medium['total'])
+        if delays:
+            min_delay = delays[98*len(delays)//100]
+            max_delay = delays[99*len(delays)//100]
+            for pkt in tcp_packets.values():
+                if not 'tso_length' in pkt or not 'xmit' in pkt or not 'gro' in pkt:
+                    continue
+                tso_length = pkt['tso_length']
+                if tso_length <= short_limit or tso_length > medium_limit:
+                    continue
+                delay = pkt['gro'] - pkt['xmit']
+                if delay >= min_delay and delay <= max_delay:
+                    self.get_pkt_delays(pkt, p99)
+
+        print('\nP98-P99 packets %d-%d bytes:' % (short_limit + 1, medium_limit))
+        print('Xmit  %s' % print_pcts(p99['nic']))
+        print('Qdisc %s' % print_pcts(p99['qdisc']))
+        print('Gro   %s' % print_pcts(p99['gro']))
+        print('Free  %s' % print_pcts(p99['free']))
+        print('Net   %s' % print_pcts(p99['net']))
+        print('Total %s' % print_pcts(p99['total']))
+
+        # Print stats for P98-99 long packets
+        p99 = defaultdict(list)
+        delays = sorted(long['total'])
+        if delays:
+            min_delay = delays[98*len(delays)//100]
+            max_delay = delays[99*len(delays)//100]
+            for pkt in tcp_packets.values():
+                if not 'tso_length' in pkt or not 'xmit' in pkt or not 'gro' in pkt:
+                    continue
+                tso_length = pkt['tso_length']
+                if tso_length <= medium_limit:
+                    continue
+                delay = pkt['gro'] - pkt['xmit']
+                if delay >= min_delay and delay <= max_delay:
+                    self.get_pkt_delays(pkt, p99)
+
+        print('\nP98-P99 packets > %d bytes:' % (medium_limit))
+        print('Xmit  %s' % print_pcts(p99['nic']))
+        print('Qdisc %s' % print_pcts(p99['qdisc']))
+        print('Gro   %s' % print_pcts(p99['gro']))
+        print('Free  %s' % print_pcts(p99['free']))
+        print('Net   %s' % print_pcts(p99['net']))
+        print('Total %s' % print_pcts(p99['total']))
+
+        # Print stats for P98-99 acks
+        p99 = defaultdict(list)
+        delays = sorted(ack['total'])
+        if delays:
+            min_delay = delays[98*len(delays)//100]
+            max_delay = delays[99*len(delays)//100]
+            for pkt in tcp_packets.values():
+                if not 'tso_length' in pkt or not 'xmit' in pkt or not 'gro' in pkt:
+                    continue
+                tso_length = pkt['tso_length']
+                if tso_length != 0:
+                    continue
+                delay = pkt['gro'] - pkt['xmit']
+                if delay >= min_delay and delay <= max_delay:
+                    self.get_pkt_delays(pkt, p99)
+
+        print('\nP98-P99 acks:')
+        print('Xmit  %s' % print_pcts(p99['nic']))
+        print('Qdisc %s' % print_pcts(p99['qdisc']))
+        print('Gro   %s' % print_pcts(p99['gro']))
+        print('Free  %s' % print_pcts(p99['free']))
+        print('Net   %s' % print_pcts(p99['net']))
+        print('Total %s' % print_pcts(p99['total']))
+
+#------------------------------------------------
+# Analyzer: tcppackets
+#------------------------------------------------
+class AnalyzeTcppackets:
+    """
+    Collects information about each TCP packet but doesn't generate any
+    output. The data it collects is used by other analyzers.
+    """
+
+    def __init__(self, dispatcher):
+        return
+
+    def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, seq_ack):
+        global tcp_hdr_length
+
+        tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+        # if 'xmit' in tcp_pkt:
+        #     print('%9.3f: Duplicate TCP packet transmission on node %s (previous: %.3f)' % (t,
+        #             trace['node'], tcp_pkt['xmit']))
+        node = trace['node']
+        tcp_pkt['xmit'] = t
+        tcp_pkt['xmit2'] = t
+        tcp_pkt['tso_length'] = data_bytes
+        tcp_pkt['tx_node'] = node
+        tcp_pkt['tx_core'] = core
+        set_tcp_ip_node(source, node)
+
+    def tt_tcp_qdisc(self, trace, t, core, source, dest, data_bytes, seq_ack):
+        tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+        node = trace['node']
+        tcp_pkt['qdisc_xmit'] = t
+        tcp_pkt['xmit2'] = t
+        tcp_pkt['tso_length'] = data_bytes
+        tcp_pkt['tx_node'] = node
+        set_tcp_ip_node(source, node)
+
+    def tt_tcp_nic(self, trace, t, core, source, dest, data_bytes, seq_ack):
+        tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+        node = trace['node']
+        tcp_pkt['nic'] = t
+        tcp_pkt['tso_length'] = data_bytes
+        tcp_pkt['tx_node'] = node
+        tcp_pkt['nic_core'] = core
+        set_tcp_ip_node(source, node)
+
+    def tt_tcp_free(self, trace, t, core, source, dest, data_bytes, seq_ack,
+            qid):
+        tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+        node = trace['node']
+        tcp_pkt['free_tx_skb'] = t
+        tcp_pkt['tso_length'] = data_bytes
+        tcp_pkt['tx_qid'] = qid
+        tcp_pkt['tx_node'] = node
+        set_tcp_ip_node(source, node)
+
+    def tt_tcp_gro(self, trace, t, core, source, dest, data_bytes, seq_ack):
+        global tcp_hdr_length
+
+        tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+        node = trace['node']
+        tcp_pkt['length'] = data_bytes
+        tcp_pkt['gro'] = t
+        tcp_pkt['rx_node'] = node
+        set_tcp_ip_node(dest, node)
+
+    def tt_tcp_softirq(self, trace, t, core, source, dest, data_bytes, seq_ack):
+        global tcp_hdr_length
+
+        tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+        node = trace['node']
+        tcp_pkt['length'] = data_bytes
+        tcp_pkt['softirq'] = t
+        tcp_pkt['rx_node'] = node
+        set_tcp_ip_node(dest, node)
+
+    def analyze(self):
+        """
+        This method post-processes all of the TCP packets to fill in missing
+        fields.
+        """
+        global tcp_packets
+
+        # "source dest" -> list of data packets (nonzero length) from
+        # source to dest, where source and dest come from fields in packets
+        # with the same name.
+        stream_pkts = defaultdict(list)
+
+        # Pass 1: divide data packets into buckets for unidirectional
+        # streams, and also fill in a few fields.
+        for pkt in tcp_packets.values():
+            if not pkt['tx_node']:
+                node = get_tcp_node(pkt['source'])
+                if node != None:
+                    pkt['tx_node'] = node
+            if not pkt['rx_node']:
+                node = get_tcp_node(pkt['dest'])
+                if node != None:
+                    pkt['rx_node'] = node
+            if not 'length' in pkt:
+                if not 'tso_length' in pkt:
+                    print('No tso_length in packet: %s' % (pkt))
+                pkt['length'] = pkt['tso_length']
+
+            if pkt['length'] == 0:
+                continue
+            stream_pkts[f"{pkt['source']} {pkt['dest']}"].append(pkt)
+
+        # Pass 2: process the packets in a stream in sequence order, in
+        # order to copy information from a source TSO packet into each of
+        # the segments generated from it.
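+        # (Example with made-up numbers: a TSO packet with seq_ack 0 and
+        # tso_length 3000 covers receiver-side segments with seq_ack 0 and
+        # 1500, so sender-side fields such as 'nic' and 'free_tx_skb' that
+        # were recorded for the TSO packet are copied into both segments.)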
+ for pkts in stream_pkts.values(): + tso_pkt = None + tso_end = None + for pkt in sorted(pkts, key = lambda pkt: pkt['seq_ack']): + if 'tso_length' in pkt: + tso_pkt = pkt + tso_end = pkt['seq_ack'] + pkt['tso_length'] + continue + if tso_pkt == None or pkt['seq_ack'] >= tso_end: + continue + tso_pkt['segments'].append(pkt) + for field in ['xmit', 'qdisc_xmit', 'xmit2', 'tx_qid', + 'nic', 'free_tx_skb']: + if field in tso_pkt: + pkt[field] = tso_pkt[field] + +#------------------------------------------------ +# Analyzer: temp +#------------------------------------------------ +class AnalyzeTemp: + """ + This analyzer is used to implement temporary checks used during + debugging. Consult the code to see what it does right now. + """ + def __init__(self, dispatcher): + # dispatcher.interest('AnalyzeTcp_rpcs') + # dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') + + def output(self): + global packets, grants, tcp_packets + + # node -> dict of addr -> core, where addr is a sender address and + # core is the GRO core for that address (for Homa) + node_cores = defaultdict(dict) + + for pkt in packets.values(): + if 'gro_core' in pkt and pkt['tx_node'] and pkt['rx_node']: + node_cores[pkt['rx_node']][pkt['tx_node']] = pkt['gro_core'] + + print('\nNode Conflict Max') + total_conflicts = 0 + for node in get_sorted_nodes(): + cores = defaultdict(lambda: 0) + # print('Node %s: %s' % (node, node_cores[node])) + for addr, core in node_cores[node].items(): + cores[core] += 1 + conflicts = 0 + max_conflict = 0 + # print('Node %s core info: %s' % (node, cores)) + for count in cores.values(): + conflicts += count - 1 + if count - 1 > max_conflict: + max_conflict = count - 1 + total_conflicts += conflicts + print('%-8s %3d %3d' % (node, conflicts, max_conflict)) + print('Total conflicts: %d' % (total_conflicts)) + + def output_slow_pkts(self): + pkts = [] + delays = [] + for pkt in packets.values(): + if (pkt['msg_length'] == None or pkt['msg_length'] <= 1000 or + pkt['msg_length'] >= 60000): + continue + if not 'nic' in pkt or not 'gro' in pkt or not 'tso_length' in pkt: + continue + delay = pkt['gro'] - pkt['nic'] + if delay >= 300: + pkts.append(pkt) + else: + if pkt['id'] == 400376130 or pkt['id'] == 400376131: + print('Packet id %d, offset %d, delay %.1f: %s' % + (pkt['id'], pkt['offset'], delay, pkt)) + delays.append(delay) + print('# Packets from messages with length 1000-60000 and') + print('# nic->gro delays > 300 usecs:') + print(print_pkts(pkts), end='') + + def output_delays(self): + global packets, options, rpcs + + delays = [] + + for pkt in packets.values(): + if not 'nic' in pkt or not 'gro' in pkt: + continue + if options.node != None and pkt['tx_node'] != options.node: + continue + if not pkt['id'] in rpcs: + continue + rpc = rpcs[pkt['id']] + if not 'out_length' in rpc: + continue + length = rpc['out_length'] + if length <= 1000 or length > 1400: + continue + delays.append(pkt['gro'] - pkt['nic']) + if not delays: + print('No packets matched!') + return + delays.sort() + plot_ccdf(delays, 'temp_delays.pdf') + print('%d data points, P50 %.1f P90 %.1f P99 %.1f max %.1f' % ( + len(delays), delays[50*len(delays)//100], + delays[90*len(delays)//100], + delays[99*len(delays)//100], delays[-1])) + + def output_slow_rpcs(self): + global packets, rpcs + + matches = [] + max_rpc = None + max_rtt = 0 + for rpc in rpcs.values(): + # print('RPC id %d: %s\n' % (rpc['id'], rpc)) + if not 'sendmsg' in rpc or not 'recvmsg_done' in rpc: 
+ continue + if rpc['out_length'] < 1000 or rpc['out_length'] > 1400: + continue + if rpc['id'] & 1: + continue + rtt = rpc['recvmsg_done'] - rpc['sendmsg'] + if rtt > max_rtt: + max_rpc = rpc + max_rtt = rtt + if rtt >= 150: + matches.append(rpc) + if not matches and max_rpc != None: + matches = [max_rpc] + for rpc in matches: + peer = rpc['peer'] + rtt = rpc['recvmsg_done'] - rpc['sendmsg'] + print('RPC id %d (%s -> %s) took %.1f usecs, length %d, %.3f -> %.3f' % + (rpc['id'], rpc['node'], + ip_to_node[peer] if peer in ip_to_node else peer, + rtt, rpc['out_length'], rpc['sendmsg'], rpc['recvmsg_done'])) + if rpc['send_data_pkts']: + pkt = rpc['send_data_pkts'][0] + if 'nic' in pkt and 'gro' in pkt: + print(' Request packet network time %.1f usecs (nic %.3f, gro %.3f)' % + (pkt['gro'] - pkt['nic'], pkt['nic'], pkt['gro'])) + if rpc['softirq_data_pkts']: + pkt = rpc['softirq_data_pkts'][0] + if 'nic' in pkt and 'gro' in pkt: + print(' Response packet network time %.1f usecs (nic %.3f, gro %.3f)' % + (pkt['gro'] - pkt['nic'], pkt['nic'], pkt['gro'])) + + max_free_delay = 0 + max_pkt = None + max_gro_free = 0 + max_gro_free_pkt = None + for pkt in packets.values(): + if not 'nic' in pkt or not 'free_tx_skb' in pkt: + continue + if not 'tso_length' in pkt or not 'gro' in pkt: + continue + if 'tx_qid' in pkt and pkt['tx_qid'] <= 1: + delay = min(pkt['free_tx_skb'], pkt['gro']) - pkt['nic'] + if delay > max_free_delay: + max_free_delay = delay + max_pkt = pkt + delay = pkt['free_tx_skb'] - pkt['gro'] + if delay > max_gro_free: + max_gro_free = delay + max_gro_free_pkt = pkt + # print("New max_gro_free_pkt: %s" % (pkt)) + print('\nMax NIC delay: %.1f usecs, id %d, offset %d, node %s, nic %.3f, free %.3f, gro %.3f' % + (max_free_delay, max_pkt['id'], max_pkt['offset'], + max_pkt['tx_node'], max_pkt['nic'], max_pkt['free_tx_skb'], + max_pkt['gro'])) + print('\nMax GRO->free delay: %.1f usecs, id %d, offset %d, node %s, nic %.3f, free %.3f, gro %.3f' % + (max_gro_free, max_gro_free_pkt['id'], + max_gro_free_pkt['offset'], max_gro_free_pkt['tx_node'], + max_gro_free_pkt['nic'], max_gro_free_pkt['free_tx_skb'], + max_gro_free_pkt['gro'])) + + def output_snapshot(self): + global packets, rpcs + + # Desired time for snapshot + t = 18000.0 + + # Desired target node + target = 'node3' + + print('\n-------------------') + print('Analyzer: temp') + print('-------------------') + print('Packets incoming to %s at time %.1f' % (target, t)) + + # Node name -> {pkts, bytes} in transit from node at given time. + nodes = {} + + # Core number -> {pkts, bytes} in transit to GRO core at given time. + cores = {} + + # RPC id -> {pkts, bytes} in transit for that RPC at given time. 
+ rpc_counts = {} + + total_packets = 0 + total_bytes = 0 + + for pkt in packets.values(): + if False: + print('Packet: %s' % (pkt)) + missing_fields = False + for field in ['xmit', 'gro', 'id', 'gro_core', 'offset']: + if not field in pkt: + missing_fields = True + break + if missing_fields: + continue + if pkt['xmit'] > t: + continue + if pkt['gro'] < t: + continue + id = pkt['id'] + tx_node = rpcs[id]['node'] + rx_node = rpcs[id^1]['node'] + if rx_node != target: + continue + length = get_recv_length(pkt['offset'], pkt['msg_length']) + total_packets += 1 + total_bytes += length + + if not tx_node in nodes: + nodes[tx_node] = {'pkts': 0, 'bytes': 0} + node = nodes[tx_node] + node['pkts'] += 1 + node['bytes'] += length + + if not pkt['gro_core'] in cores: + cores[pkt['gro_core']] = {'pkts': 0, 'bytes': 0} + core = cores[pkt['gro_core']] + core['pkts'] += 1 + core['bytes'] += length + + if not id in rpc_counts: + rpc_counts[id] = {'pkts': 0, 'bytes': 0} + rpc = rpc_counts[id] + rpc['pkts'] += 1 + rpc['bytes'] += length + + print('\nTotal packets %d, total bytes %d' % (total_packets, total_bytes)) + + print('\nSource nodes:') + print('Node Pkts Bytes') + for name in get_sorted_nodes(): + if not name in nodes: + continue + node = nodes[name] + print('%-10s %5d %8d' % (name, node['pkts'], node['bytes'])) + + + print('\nGRO cores:') + print('Node Pkts Bytes') + for core_num in sorted(cores.keys()): + core = cores[core_num] + print('%4d %5d %8d' % (core_num, core['pkts'], core['bytes'])) + + print('\nRPCs:') + print('Id Pkts Bytes') + for id in sorted(rpc_counts.keys()): + rpc = rpc_counts[id] + print('%-10d %5d %8d' % (id, rpc['pkts'], rpc['bytes'])) + +#------------------------------------------------ +# Analyzer: temp2 +#------------------------------------------------ +class AnalyzeTemp2: + """ + This analyzer is used to implement temporary checks used during + debugging. Consult the code to see what it does right now. + """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') + + def output(self): + global packets + + tcp_headers = 20 + 20 + 18 + homa_headers = 56 + 20 + 18 + data_bytes = 0 + total_bytes = 0 + pkts = 0 + for pkt in itertools.chain(packets.values(), tcp_packets.values()): + if pkt['tx_node'] != 'node4': + continue + if not 'nic' in pkt or pkt['nic'] < 17750 or pkt['nic'] >= 17950: + continue + if not 'tso_length' in pkt: + continue + bytes = pkt['tso_length'] + data_bytes += bytes + if pkt['type'] == 0: + total_bytes += bytes + tcp_headers + else: + total_bytes += bytes + homa_headers + pkts += 1 + + print('%d packets, data %d bytes (%.3f usec), total %d bytes (%.3f usec)' + % (pkts, data_bytes, data_bytes / 12.5e03, total_bytes, + total_bytes / 12.5e03)) + +#------------------------------------------------ +# Analyzer: timeline +#------------------------------------------------ +class AnalyzeTimeline: + """ + Prints a timeline showing how long it takes for RPCs to reach various + interesting stages on both clients and servers. Most useful for + benchmarks where all RPCs are the same size. + """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + return + + def output(self): + global rpcs + num_rpcs = 0 + print('\n-------------------') + print('Analyzer: timeline') + print('-------------------') + + # These tables describe the phases of interest. 
Each sublist is
+        # a <label, name, lambda> triple, where label is a human-readable
+        # string for the phase, name selects an element of an RPC, and
+        # the lambda extracts a time from the RPC element.
+        client_phases = [
+            ['first request packet sent',      'send_data',      lambda x : x[0][0]],
+            ['softirq gets first grant',       'softirq_grant',  lambda x : x[0][0]],
+            ['last request packet sent',       'send_data',      lambda x : x[-1][0]],
+            ['gro gets first response packet', 'gro_data',       lambda x : x[0][0]],
+            ['softirq gets first response pkt', 'softirq_data',  lambda x : x[0][0]],
+            ['sent grant',                     'send_grant',     lambda x : x[0][0]],
+            ['gro gets last response packet',  'gro_data',       lambda x : x[-1][0]],
+            ['homa_recvmsg returning',         'recvmsg_done',   lambda x : x]
+        ]
+        client_extra = [
+            ['finished copying req into pkts', 'copy_in_done',   lambda x : x],
+            ['started copying to user space',  'copy_out_start', lambda x : x],
+            ['finished copying to user space', 'copy_out_done',  lambda x : x]
+        ]
+
+        server_phases = [
+            ['gro gets first request packet',  'gro_data',       lambda x : x[0][0]],
+            ['softirq gets first request pkt', 'softirq_data',   lambda x : x[0][0]],
+            ['sent grant',                     'send_grant',     lambda x : x[0][0]],
+            ['gro gets last request packet',   'gro_data',       lambda x : x[-1][0]],
+            ['homa_recvmsg returning',         'recvmsg_done',   lambda x : x],
+            ['homa_sendmsg response',          'sendmsg',        lambda x : x],
+            ['first response packet sent',     'send_data',      lambda x : x[0][0]],
+            ['softirq gets first grant',       'softirq_grant',  lambda x : x[0][0]],
+            ['last response packet sent',      'send_data',      lambda x : x[-1][0]]
+        ]
+        server_extra = [
+            ['started copying to user space',  'copy_out_start', lambda x : x],
+            ['finished copying to user space', 'copy_out_done',  lambda x : x],
+            ['finished copying req into pkts', 'copy_in_done',   lambda x : x]
+        ]
+
+        # One entry in each of these lists for each phase of the RPC,
+        # values are lists of times from RPC start (or previous phase)
+        client_totals = []
+        client_deltas = []
+        client_extra_totals = []
+        client_extra_deltas = []
+        server_totals = []
+        server_deltas = []
+        server_extra_totals = []
+        server_extra_deltas = []
+
+        # Collect statistics from all of the RPCs.
+        for id, crpc in rpcs.items():
+            # Find matching and complete pairs of client-side and
+            # server-side RPCs.
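+            # (Client RPC ids are even and the matching server-side RPC
+            # has id^1; this is the id convention the rest of this script
+            # relies on, so odd ids are skipped here.)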
+            if id & 1:
+                continue
+            if not ((id^1) in rpcs):
+                continue
+            srpc = rpcs[id^1]
+            if (not 'sendmsg' in crpc) or (not 'recvmsg_done' in crpc):
+                continue
+            if (not crpc['gro_data']) or (crpc['gro_data'][0][1] != 0) \
+                    or (not crpc['send_data']):
+                continue
+            num_rpcs += 1
+
+            start = crpc['sendmsg']
+            self.__collect_stats(client_phases, crpc, start, client_totals,
+                    client_deltas)
+            self.__collect_stats(client_extra, crpc, start, client_extra_totals,
+                    client_extra_deltas)
+            self.__collect_stats(server_phases, srpc, start, server_totals,
+                    server_deltas)
+            self.__collect_stats(server_extra, srpc, start, server_extra_totals,
+                    server_extra_deltas)
+
+        if client_totals:
+            print('\nTimeline for clients (%d RPCs):\n' % (num_rpcs))
+            self.__print_phases(client_phases, client_totals, client_deltas)
+            print('')
+            self.__print_phases(client_extra, client_extra_totals,
+                    client_extra_deltas)
+        if server_totals:
+            print('\nTimeline for servers (%d RPCs):\n' % (num_rpcs))
+            self.__print_phases(server_phases, server_totals, server_deltas)
+            print('')
+            self.__print_phases(server_extra, server_extra_totals,
+                    server_extra_deltas)
+
+    def __collect_stats(self, phases, rpc, start, totals, deltas):
+        """
+        Utility method used by output to aggregate delays within an RPC
+        into buckets corresponding to different phases of the RPC.
+        phases:     Describes the phases to aggregate
+        rpc:        Dictionary containing information about one RPC
+        start:      Starting time for RPC on client-side
+        totals:     Total delays from start of the RPC are collected here
+        deltas:     Delays from one phase to the next are collected here
+        """
+
+        while len(phases) > len(totals):
+            totals.append([])
+            deltas.append([])
+        prev = start
+        for i in range(len(phases)):
+            phase = phases[i]
+            if phase[1] in rpc:
+                rpc_phase = rpc[phase[1]]
+                if rpc_phase:
+                    t = phase[2](rpc_phase)
+                    totals[i].append(t - start)
+                    deltas[i].append(t - prev)
+                    prev = t
+
+    def __print_phases(self, phases, totals, deltas):
+        """
+        Utility method used by output to print out summary statistics
+        aggregated by __collect_stats
+        """
+        for i in range(0, len(phases)):
+            label = phases[i][0]
+            if not totals[i]:
+                print('%-32s (no events)' % (label))
+                continue
+            elapsed = sorted(totals[i])
+            gaps = sorted(deltas[i])
+            print('%-32s Avg %7.1f us (+%7.1f us) P90 %7.1f us (+%7.1f us)' %
+                    (label, sum(elapsed)/len(elapsed), sum(gaps)/len(gaps),
+                    elapsed[9*len(elapsed)//10], gaps[9*len(gaps)//10]))
+
+#------------------------------------------------
+# Analyzer: txintervals
+#------------------------------------------------
+class AnalyzeTxintervals:
+    """
+    Computes statistics related to packet transmission over intervals,
+    and generates one data file for each node showing interval data for
+    that node. Requires the --data and --gbps options. Also uses the
+    --interval option. If --tx-qid is specified, then only packets transmitted
+    via that qid will be considered.
+    """
+
+    def __init__(self, dispatcher):
+        interval_analyzer = dispatcher.interest('AnalyzeIntervals')
+        if options.tx_qid != None:
+            interval_analyzer.restrict_qid(options.tx_qid)
+        return
+
+    def output(self):
+        global intervals, options, traces
+
+        print('\n---------------------')
+        print('Analyzer: txintervals')
+        print('---------------------')
+        if options.data == None:
+            print('--data option wasn\'t specified, so no output generated.')
+            return
+        print('See data files txintervals_*.dat in %s\n' % (options.data))
+        print('Average transmit throughput:')
+
+        if options.tx_qid != None:
+            qid_msg = ' (considers only transmit queue %d)' % (options.tx_qid)
+        else:
+            qid_msg = ''
+
+        for node in get_sorted_nodes():
+            f = open('%s/txintervals_%s.dat' % (options.data, node), 'w')
+            f.write('# Node: %s\n' % (node))
+            f.write('# Generated at %s.\n' %
+                    (time.strftime('%I:%M %p on %m/%d/%Y')))
+            f.write('# Statistics about message transmission from node ')
+            f.write('%s over %d usec\n' % (node, options.interval))
+            f.write('# intervals%s:\n' % (qid_msg))
+            f.write('# Time:    End of the time interval\n')
+            f.write('# Gbps:    Rate of data passed to ip*xmit during '
+                    'the interval\n')
+            f.write('# TxKB:    KB of data passed to ip*xmit during '
+                    'the interval\n')
+            f.write('# RPCs:    Number of live client RPCs at the end '
+                    'of the interval\n')
+            f.write('# Reqs:    Request messages that have been started '
+                    'but not fully\n')
+            f.write('#          transmitted as of the end of the interval\n')
+            f.write('# Resps:   Response messages that have been started '
+                    'but not fully\n')
+            f.write('#          transmitted as of the end of the interval\n')
+            f.write('# Pkts:    Packets transmitted during the interval\n')
+            f.write('# QDisc:   KB of data that have been passed to ip*xmit '
+                    'but not yet\n')
+            f.write('#          passed to the NIC, as of the end of the '
+                    'interval\n')
+            f.write('# NicKB:   KB of data passed to NIC during the interval\n')
+            f.write('# NQEst:   Estimate of NIC queue length at the end '
+                    'of the interval,\n')
+            f.write('#          measured in usecs to xmit (assumes the NIC '
+                    'can transmit at\n')
+            f.write('#          link speed)\n')
+            f.write('# InNic:   KB of data that have been queued for the '
+                    'NIC but whose packets\n')
+            f.write('#          have not yet been returned after '
+                    'transmission\n')
+            f.write('# NicRx:   KB of data that are still in the NIC\'s '
+                    'possession (their packets\n')
+            f.write('#          haven\'t been returned after transmission) '
+                    'even though the data\n')
+            f.write('#          has been received by the destination, as '
+                    'of the end of the\n')
+            f.write('#          interval\n')
+            f.write('# FreeKB:  KB of skb data freed after NIC notified '
+                    'transmission complete\n')
+            f.write('# MinFr:   Smallest p[\'free_tx_skb\'] - p[\'nic\'] for a '
+                    'packet passed to\n')
+            f.write('#          NIC in this interval\n')
+            f.write('# MaxFr:   Largest p[\'free_tx_skb\'] - p[\'nic\'] for a '
+                    'packet passed to\n')
+            f.write('#          NIC in this interval\n')
+            f.write('# MinGF:   Smallest p[\'gro\'] - p[\'free_tx_skb\'] '
+                    'for any segment of a\n')
+            f.write('#          packet passed to NIC in this interval\n')
+            f.write('# MaxGF:   Largest p[\'gro\'] - p[\'free_tx_skb\'] '
+                    'for any segment of a\n')
+            f.write('#          packet passed to NIC in this interval\n')
+            f.write('# GXmit:   KB of grants that have been sent by peer '
+                    'but not yet\n')
+            f.write('#          received by GRO\n')
+            f.write('# GGro:    KB of grants that have been received by GRO '
+                    'but not yet\n')
+            f.write('#          received by SoftIRQ\n')
+            f.write('# GAvail:  KB of grants that have been received by '
+                    'SoftIRQ but data hasn\'t\n')
+            f.write('#          
been transmitted yet\n') + f.write('# GNew: KB of new grants received by SoftIRQ ' + 'during the interval\n') + + f.write('\n# Time Gbps TxKB RPCs Reqs Resps') + f.write(' Pkts Qdisc NicKB NQEst InNic NicRx FreeKB') + f.write(' MinFr MaxFr MinGF MaxGF') + f.write(' GXmit GGro GAvail GNew\n') + total = 0 + for interval in intervals[node]: + if not 'tx_bytes' in interval: + interval['tx_bytes'] = 0 + # print('Bogus interval: %s' % (interval)) + # print('Trace: %s' % (traces[node])) + gbps = interval['tx_bytes'] * 8 / (options.interval * 1000) + total += gbps + f.write('%8.1f %6.1f %5.0f %5d %5d %5d' % + (interval['time'], gbps, + interval['tx_bytes'] * 1e-3, + interval['rpcs_live'], + interval['tx_live_req'], + interval['tx_live_resp'])) + f.write(' %4d %5.0f %5.0f %5.1f %5.0f %5.0f %5.0f' % ( + interval['tx_pkts'], interval['tx_qdisc'] * 1e-3, + interval['tx_nic_bytes'] * 1e-3, + interval['tx_q'] * 8 / (options.gbps * 1000), + interval['tx_in_nic'] * 1e-3, + interval['tx_nic_rx'] * 1e-3, + interval['tx_free_bytes'] * 1e-3)) + v = interval['tx_min_free'] + min_free = '%.1f' % v if v != 0 else '' + v = interval['tx_max_free'] + max_free = '%.1f' % v if v != 0 else '' + v = interval['tx_min_gro_free'] + min_gro_free= '%.1f' % v if v != None else '' + v = interval['tx_max_gro_free'] + max_gro_free = '%.1f' % v if v != None else '' + f.write(' %6s %6s %6s %6s' % (min_free, max_free, min_gro_free, + max_gro_free)) + f.write(' %5.0f %5.0f %5.0f %5.0f\n' + % (interval['tx_grant_xmit'] * 1e-3, + interval['tx_grant_gro'] * 1e-3, + interval['tx_grant_avl'] * 1e-3, + interval['tx_new_grants'] * 1e-3)) + f.close() + print('%-10s %6.1f Gbps' % (node, total/len(intervals[node]))) + +#------------------------------------------------ +# Analyzer: txpkts +#------------------------------------------------ +class AnalyzeTxpkts: + """ + Generates one data file for each node showing information about every + data packet transmitted from that node, in time order. If either --node or + --tx-qid is specified, only packets matching those options will be + considered. Packets will normally be sorted by the 'Xmit' column, but the + --sort option can be used to specify a different column to use for sorting. + Also generates aggregate statistics for each tx queue on each node. + """ + + def __init__(self, dispatcher): + global options + require_options('txpkts', 'data') + dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') + + def output(self): + global packets, tcp_packets, options, traces + + # node -> list of packets transmitted by that node + node_pkts = defaultdict(list) + + # Bucket all of the packets by transmitting node. 
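+        # For orientation (field names inferred from the code below, not an
+        # exhaustive schema): each pkt is a dict of trace-derived values such
+        # as
+        #     {'tx_node': 'node3',      # transmitting node
+        #      'xmit': 1234.5,          # time handed to ip*xmit (usecs)
+        #      'nic': 1236.0,           # time handed to the NIC
+        #      'gro': 1250.2,           # time seen by GRO on the receiver
+        #      'free_tx_skb': 1260.7,   # time the tx sk_buff was freed
+        #      'tso_length': 42000,     # bytes in the TSO frame
+        #      'tx_qid': 3}             # transmit queue id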
+        for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+            if not 'xmit' in pkt or not 'tso_length' in pkt:
+                continue
+            if not 'gro' in pkt:
+                continue
+            node_pkts[pkt['tx_node']].append(pkt)
+
+        print('\n----------------')
+        print('Analyzer: txpkts')
+        print('----------------')
+        print('See data files txpkts_*.dat in %s\n' % (options.data))
+        print('Summary statistics on delays related to outgoing packets:')
+        print('Node:    Name of node')
+        print('Qid:     Identifier of transmit queue')
+        print('TxQueue: Address of netdev_queue struct for Qid')
+        print('Tsos:    Total number of TSO frames transmitted by node '
+                'or queue')
+        print('Segs:    Total number of segments (packets received by GRO) '
+                'transmitted by')
+        print('         node or queue')
+        print('Gbps:    Throughput of that queue')
+        print('PTsos:   Total number of TSO frames that were transmitted '
+                'by the pacer')
+        print('QTsos:   Total number of TSO frames that were deferred by '
+                'homa_qdisc to')
+        print('         limit NIC queue length')
+        print('Backlog: Average KB of tx data that were in the '
+                'possession of the NIC')
+        print('         (presumably without being transmitted) longer than '
+                '%d usec' % (options.threshold))
+        print('         (%d is the value of the --threshold option)'
+                % (options.threshold))
+        print('BFrac:   Fraction of all bytes passing through this queue '
+                'that spent more')
+        print('         than %d usec in the NIC (where %d is the value of '
+                'the --threshold' % (options.threshold, options.threshold))
+        print('         option)')
+        print('NicP10:  10th percentile of NIC delay (time from xmit to NIC '
+                'handoff)')
+        print('NicP50:  Median NIC delay')
+        print('NicP90:  90th percentile of NIC delay')
+        print('GroP10:  10th percentile of GRO delay (time from NIC handoff '
+                'to receipt by')
+        print('         destination GRO)')
+        print('GroP50:  Median GRO delay')
+        print('GroP90:  90th percentile of GRO delay')
+        print('FreP10:  10th percentile of free delay (time from NIC handoff '
+                'to freeing')
+        print('         transmit packet buffer)')
+        print('FreP50:  Median free delay')
+        print('FreP90:  90th percentile of free delay')
+
+        first_node = True
+        node_info = ''
+        q_details = ''
+        for node in get_sorted_nodes():
+            if (options.node != None) and (node != options.node):
+                continue
+
+            # Tx queue number -> dictionary mapping from delay type to a list
+            # of delays of the given type on the given transmit queue.
+            # Delay types currently used:
+            # nic:   delay from xmit to nic doorbell
+            # gro:   delay from nic doorbell to gro on receiver
+            # free:  delay from nic doorbell to sk_buff free on sender
+            delays = defaultdict(lambda: defaultdict(list))
+
+            # Tx queue number -> total number of TSO frames transmitted on
+            # that queue
+            qid_tsos = defaultdict(lambda: 0)
+
+            # Tx queue number -> total number of packets (segments) transmitted
+            # on that queue
+            qid_segs = defaultdict(lambda: 0)
+
+            # Tx queue number -> total number of bytes transmitted on that
+            # queue
+            qid_bytes = defaultdict(lambda: 0)
+
+            # Tx queue number -> total number of TSO frames transmitted
+            # by the pacer on that queue
+            qid_pacer_tsos = defaultdict(lambda: 0)
+
+            # Tx queue number -> total number of TSO frames on that queue
+            # that were deferred by homa_qdisc because of NIC queue overload
+            qid_qdisc_tsos = defaultdict(lambda: 0)
+
+            # Tx queue number -> integral of (excess time * KB) for TSO packets
+            # that have spent "too much time" in the NIC. Excess time is
+            # pkt['free_tx_skb'] - pkt['nic'] - options.threshold, and KB is
+            # the length of the TSO packet.
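+            # Worked example of the integral (illustrative numbers only):
+            # with --threshold 50, a 12000-byte TSO frame handed to the
+            # NIC at t = 100 us and freed at t = 180 us has excess time
+            # 180 - 100 - 50 = 30 us, so it contributes 30*12000 usec*bytes
+            # here and 12000 bytes to qid_slow_bytes below.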
+            qid_backlog = defaultdict(lambda: 0.0)
+
+            # Tx queue number -> total number of bytes that passed through
+            # this queue.
+            qid_total_bytes = defaultdict(lambda: 0)
+
+            # Tx queue number -> total number of message bytes that spent
+            # longer than options.threshold in the NIC.
+            qid_slow_bytes = defaultdict(lambda: 0)
+
+            # Tx queue number -> hex address of netdev_queue
+            qid_tx_queue = defaultdict(lambda: '')
+
+            total_pkts = 0
+
+            # Select packets to print for this node, plus gather statistics.
+            pkts = []
+            for pkt in node_pkts[node]:
+                xmit = pkt['xmit']
+                gro = pkt['gro'] if 'gro' in pkt else None
+                free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None
+                nic = pkt['nic'] if 'nic' in pkt else None
+                qdisc = pkt['qdisc_xmit'] if 'qdisc_xmit' in pkt else None
+                qid = pkt['tx_qid'] if 'tx_qid' in pkt else None
+                length = pkt['tso_length']
+
+                nic_delay = None
+                if nic != None:
+                    nic_delay = nic - xmit
+
+                if qid != None:
+                    qid_tsos[qid] += 1
+                    segs = 1
+                    if 'segments' in pkt:
+                        segs += len(pkt['segments'])
+                    qid_segs[qid] += segs
+                    if 'tso_length' in pkt:
+                        qid_bytes[qid] += length
+                    if 'pacer' in pkt:
+                        qid_pacer_tsos[qid] += 1
+                    if qdisc != None:
+                        qid_qdisc_tsos[qid] += 1
+                    if 'tx_queue' in pkt:
+                        qid_tx_queue[qid] = pkt['tx_queue']
+                    if (options.tx_qid == None) or (qid == options.tx_qid):
+                        pkts.append(pkt)
+                else:
+                    if options.tx_qid == None:
+                        pkts.append(pkt)
+                total_pkts += 1
+
+                if (len(pkt['retransmits']) == 0 and qid != None and
+                        nic_delay != None):
+                    delays[qid]['nic'].append(nic_delay)
+                    if gro != None:
+                        delays[qid]['gro'].append(gro - nic)
+                    if free != None:
+                        delays[qid]['free'].append(free - nic)
+
+                if nic != None:
+                    t = free if free != None else traces[node]['last_time']
+                    excess = (t - nic) - options.threshold
+                    if excess > 0:
+                        qid_backlog[qid] += excess * length
+                        qid_slow_bytes[qid] += length
+                    qid_total_bytes[qid] += length
+
+            # Create a data file for this node with packets in time order
+            # (or whatever order was requested on the command line).
+            key = options.sort
+            if key == None:
+                key = 'Xmit'
+            pkts = sort_pkts(pkts, key)
+
+            f = open('%s/txpkts_%s.dat' % (options.data, node), 'w')
+            f.write('# Node: %s\n' % (node))
+            f.write('# Generated at %s.\n' %
+                    (time.strftime('%I:%M %p on %m/%d/%Y')))
+            f.write('# Data packets transmitted from %s:\n' % (node))
+            f.write(print_pkts(pkts, comment=True))
+            f.close()
+
+            def print_type(delays):
+                delays.sort()
+                count = len(delays)
+                if count > 0:
+                    return '%6.1f %6.1f %6.1f' % (delays[10*count//100],
+                            delays[50*count//100], delays[90*count//100])
+                return ' '*20
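+            # print_type picks order statistics from a sorted list: with n
+            # delays it returns elements 10*n//100, 50*n//100 and 90*n//100
+            # (0-based) as P10/P50/P90; e.g. for n = 200 those are elements
+            # 20, 100 and 180.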
+            # Generate overall statistics by node and qid.
+            if not first_node:
+                q_details += '\n'
+            q_details += 'Transmit queues for %s\n' % (node)
+            q_details += 'Qid     TxQueue  Tsos  Segs   Gbps '
+            q_details += 'PTsos QTsos Backlog BFrac NicP10 NicP50 NicP90 '
+            q_details += 'GroP10 GroP50 GroP90 FreP10 FreP50 FreP90\n'
+            q_details += '-----------------------------------'
+            q_details += '-------------------------------------------------'
+            q_details += '------------------------------------------\n'
+            first_node = False
+            totals = defaultdict(list)
+            total_time = traces[node]['last_time'] - traces[node]['first_time']
+            for qid in sorted(delays.keys()):
+                q_delays = delays[qid]
+                for type, d in q_delays.items():
+                    totals[type].extend(d)
+                q_details += '%4d %10s %5d %5d %6.2f ' % (
+                        qid, qid_tx_queue[qid], qid_tsos[qid], qid_segs[qid],
+                        8e-3 * qid_bytes[qid] / traces[node]['elapsed_time'])
+                q_details += '%5d %5d %6.1f %5.2f %s %s %s\n' % (
+                        qid_pacer_tsos[qid], qid_qdisc_tsos[qid],
+                        1e-3*qid_backlog[qid]/total_time,
+                        qid_slow_bytes[qid]/qid_total_bytes[qid],
+                        print_type(q_delays['nic']),
+                        print_type(q_delays['gro']),
+                        print_type(q_delays['free']))
+            node_info += '%-10s %5d %6.1f %5.2f %s %s %s\n' % (
+                    node, total_pkts,
+                    1e-3*sum(qid_backlog.values())/total_time,
+                    div_safe(sum(qid_slow_bytes.values()),
+                            sum(qid_total_bytes.values())),
+                    print_type(totals['nic']),
+                    print_type(totals['gro']),
+                    print_type(totals['free']))
+        if not node_info:
+            print('No packet data available')
+        else:
+            print('\nNode totals')
+            print('Node        Tsos Backlog BFrac NicP10 NicP50 NicP90 '
+                    'GroP10 GroP50 GroP90 FreP10 FreP50 FreP90')
+            print('------------------------------------------------------'
+                    '------------------------------------------')
+            print(node_info)
+            print(q_details, end='')
+
+#------------------------------------------------
+# Analyzer: txqstop
+#------------------------------------------------
+class AnalyzeTxqstop:
+    """
+    Prints information about transmit queue stoppage, where netdev_tx
+    refuses to transmit packets on a dev_queue because there is too much
+    data that has been handed off to the NIC but not yet returned after
+    transmission.
+    """
+
+    def __init__(self, dispatcher):
+        # node -> list of events for that node. Each event is a list
+        # [time, queue, what], where queue is the identifier for a
+        # dev_queue and what is either "stop" or "restart". Events are
+        # not guaranteed to be in time order.
+        self.events = defaultdict(list)
+
+        # node -> maximum queue length limit observed for that node
+        self.max_limit = defaultdict(lambda: 0)
+
+        # node -> minimum queue length limit observed for that node
+        self.min_limit = defaultdict(lambda: 1e20)
+
+    def init_trace(self, trace):
+        # queue identifier -> 1. An entry exists for a queue if a
+        # queue stoppage event has been seen for that queue (used to
+        # fill in missing stop events).
+        self.stopped = {}
+
+        # Name of node for the current trace file.
+ self.node = trace['node'] + + def tt_txq_stop(self, trace, t, core, queue, limit, queued): + self.stopped[queue] = 1 + self.events[self.node].append([t, queue, 'stop']) + if limit > self.max_limit[self.node]: + self.max_limit[self.node] = limit + if limit < self.min_limit[self.node]: + self.min_limit[self.node] = limit + + def tt_txq_restart(self, trace, t, core, queue): + if not queue in self.stopped: + self.events[self.node].append([trace['first_time'], queue, 'stop']) + self.events[self.node].append([t, queue, 'restart']) + + def output(self): + + print('\n-----------------') + print('Analyzer: txqstop') + print('-----------------') + print() + print('Statistics on dev_queues that have been stopped by Linux ' + 'because there') + print('are too many bytes of packet data currently in the NIC\'s ' + 'possession for') + print('that queue:') + print('Node: Node whose data follows on this line') + print('Stopped: Fraction of time when at least one txq was stopped') + print('Avg: Average number of txqs stopped') + print('Stop1: Fraction of time when 1 txq was stopped') + print('Stop2: Fraction of time when 2 txqs were stopped') + print('Stop3: Fraction of time when 3 txqs were stopped') + print('StopMany: Fraction of time when >3 txqs were stopped') + print('LimitMin: Minimum observed value of length limit for a txq') + print('LimitMax: Maximum observed value of length limit for a txq') + print() + print('Node Stopped Avg Stop1 Stop2 Stop3 StopMany LimitMin LimitMax') + + for node in get_sorted_nodes(): + # queue identifier -> 1 if that queue is currently stopped; + # no entry if queue is running + stopped = {} + + # Used to compute the average number of queues stopped; sum of + # (time_delta * stopped) + avg_stopped = 0 + + # Total time that [1, 2, 3, >3] queues were stopped. + stop_time = [0, 0, 0, 0] + + # Time of last event processed. + prev_t = traces[node]['first_time'] + + if not self.events[node]: + print('%-8s No queue stoppage events detected' % (node)) + continue + + for event in sorted(self.events[node], key = lambda t: t[0]): + t, queue, what = event + + interval = t - prev_t + num_stopped = len(stopped) + if num_stopped > 0: + avg_stopped += interval * num_stopped + index = num_stopped - 1 if num_stopped <= 3 else 3 + stop_time[index] += interval + # if num_stopped > 3: + # print('%9.3f: %d queues stopped on %s: %s' % (t, + # num_stopped, node, sorted(stopped.keys()))) + if what == 'stop': + stopped[queue] = 1 + elif what == 'restart': + if queue in stopped: + del stopped[queue] + else: + raise Exception('Bad \'what\' field in txqstop event: %s' % + (what)) + prev_t = t + + total_t = prev_t - traces[node]['first_time'] + print('%-8s %5.3f %6.2f %5.3f %5.3f %5.3f %5.3f %7d %7d' % ( + node, sum(stop_time) / total_t, avg_stopped / total_t, + stop_time[0] / total_t, stop_time[1] / total_t, + stop_time[2] / total_t, stop_time[3] / total_t, + self.min_limit[node], self.max_limit[node])) + +#------------------------------------------------ +# Analyzer: txsnapshot +#------------------------------------------------ +class AnalyzeTxsnapshot: + """ + Prints information about the state of outgoing messages from a particular + node at a given time. Requires the --node and --time options. 
+ """ + + def __init__(self, dispatcher): + global options + require_options('txsnapshot', 'time', 'node') + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + + def get_sorted_ids(self, live_rpcs): + """ + Given the results from collect_live_rpcs, return a list of the + ids in live_rpcs, sorted based on transmission priority (how close + each message is to fully transmitted). + """ + + def sort_key(live_rpcs, id, field): + if id in rpcs: + length = rpcs[id]['out_length'] + if length == None: + length = 0 + else: + length = 0 + if not field in live_rpcs[id]: + print('Missing field %s in id %d: %s' % (field, id, live_rpcs[id])) + return length - live_rpcs[id][field] + + sorted_ids = sorted(live_rpcs.keys(), + key = lambda id : live_rpcs[id]['pre_xmit2'], + reverse = True) + sorted_ids = sorted(sorted_ids, + key = lambda id : rpcs[id]['sendmsg'] + if (id in rpcs) and ('sendmsg' in rpcs[id]) else 0) + sorted_ids = sorted(sorted_ids, + key = lambda id : sort_key(live_rpcs, id, 'pre_xmit2')) + return sorted_ids + + def output(self): + global packets, rpcs, options, traces + + live_rpcs = AnalyzeRxsnapshot.collect_live_rpcs(options.node, + options.time, False) + sorted_ids = self.get_sorted_ids(live_rpcs) + + for id, rpc in rpcs.items(): + if rpc['node'] != options.node: + continue + if not 'tx_live' in rpc: + continue + start, end = rpc['tx_live'] + if (start > options.time) or (end <= options.time): + continue + if not id in live_rpcs: + print('RPC id %d not in live_rpcs: %s\n' % (id, rpc)) + + print('\n--------------------') + print('Analyzer: txsnapshot') + print('--------------------') + print('A snapshot of outgoing messages from %s at time %.1f' % ( + options.node, options.time)) + + print('\n%d RPCs have live outgoing messages:' % + (len(live_rpcs))) + print('Id: RPC identifier on the sender side') + print('Peer: Receiving node') + print('Length: Length of outgoing message, if known') + print('Window: Bytes that have been granted but not transmitted ' + '(Gsoft - Xmit)') + print('TxRem: Bytes in message that have not yet been transmitted ' + '(Length - Xmit);') + print(' smaller means higher SRPT priority for transmission') + print('Gxmit: Highest offset for which grant has been passed ' + 'to ip_*xmit') + print('GGro: Highest offset in grant that has been received by GRO') + print('GSoft: Highest offset in grant that has been processed ' + 'by SoftIRQ') + print('Xmit: Offset just after last data byte that has been ' + 'passed to ip*xmit') + print(' or requeued by homa_qdisc after deferral') + print('Gro: Offset just after last data byte that has been ' + 'processed by GRO') + print('SoftIrq: Offset just after last data byte that has been ' + 'processed by SoftIRQ') + print('Copied: Offset just after last data byte that has been ' + 'copied to user space') + print('Incoming: Gxmit - SoftIrq') + print('Lost: Packets that appear to have been dropped in the network') + print(' Id Peer Length Window TxRem GXmit ', + end='') + print('GGro GSoft Xmit Gro SoftIrq Copied Incoming Lost') + print('---------------------------------------------------------', + end='') + print('----------------------------------------------------------') + + for id in sorted_ids: + tx_rpc = rpcs[id] + live_rpc = live_rpcs[id] + incoming = (live_rpc['pre_grant_xmit'] - live_rpc['pre_softirq'] + if live_rpc['pre_grant_xmit'] > 0 else 0) + window = live_rpc['pre_grant_softirq'] - live_rpc['pre_xmit2'] + if window > 0: + window = str(window) + else: + window = "" + print('%10d %-10s %7s %7s %7s 
+                    tx_rpc['out_length'] if tx_rpc['out_length'] != None else "",
+                    window,
+                    tx_rpc['out_length'] - live_rpc['pre_xmit2']
+                    if tx_rpc['out_length'] != None else "",
+                    str(live_rpc['pre_grant_xmit'])
+                    if live_rpc['pre_grant_xmit'] > 0 else ""), end='')
+            print('%7s %7s %7d %7d %7d %7d %7d %4d' % (
+                    str(live_rpc['pre_grant_gro'])
+                    if live_rpc['pre_grant_gro'] > 0 else "",
+                    str(live_rpc['pre_grant_softirq'])
+                    if live_rpc['pre_grant_softirq'] > 0 else "",
+                    live_rpc['pre_xmit2'],
+                    live_rpc['pre_gro'], live_rpc['pre_softirq'],
+                    live_rpc['pre_copied'], incoming, live_rpc['lost']))
+
+# Parse command-line options.
+parser = OptionParser(description=
+        'Read in one or more Homa timetrace files and use one or more '
+        'analyzers to print information extracted from the file(s). The '
+        'trace files should be synchronized (collected at about the same '
+        'time and clock-synced with ttsync.py). Command-line arguments '
+        'determine which analyzers to apply and provide additional '
+        'parameters for the analyzers.',
+        usage='%prog [options] [trace trace ...]',
+        conflict_handler='resolve')
+parser.add_option('--analyzers', '-a', dest='analyzers', default='all',
+        metavar='A', help='Space-separated list of analyzers to apply to '
+        'the trace files (default: all)')
+parser.add_option('--core', dest='core', type=int, default=None,
+        metavar='C', help='Specifies the number of a particular core of '
+        'interest; required by some analyzers')
+parser.add_option('--data', '-d', dest='data', default=None,
+        metavar='DIR', help='If this option is specified, analyzers will '
+        'output data files (suitable for graphing) in the directory given '
+        'by DIR. If this option is not specified, no data files will '
+        'be generated')
+parser.add_option('--filter', dest='filter', default=None,
+        metavar='FUNC', help='\'filter_FUNC\' is the name of a function in the '
+        'analyzer class; used by some analyzers as an additional filter for '
+        'packets.')
+parser.add_option('--gbps', dest='gbps', type=float, default=100.0,
+        metavar='G', help='Link speed in Gbps (default: 100); used by some '
+        'analyzers.')
+parser.add_option('--grolat', dest='grolat', default=None,
+        metavar='L', help='Used by some analyzers to filter packets based on '
+        'the elapsed time from when the packet was passed to ip*xmit until '
+        'it was received by GRO on the destination; it can contain either a '
+        'single floating-point value (minimum latency) or two values (min and '
+        'max, inclusive).')
+parser.add_option('-h', '--help', dest='help', action='store_true',
+        help='Show this help message and exit')
+parser.add_option('--interval', dest='interval', type=int, default=50,
+        metavar='T', help='Specifies the length of intervals for '
+        'interval-based output, in microseconds (default: 50)')
+parser.add_option('--late', dest='late', type=int, default=100,
+        metavar='T', help='Specifies how long a packet must be delayed '
+        'before it is considered overdue, in microseconds (default: 100)')
+parser.add_option('--max', dest='max', type=float, default=None,
+        metavar='T', help='Upper bound to consider for some parameter; '
+        'specific meaning depends on analyzer')
+parser.add_option('--max-rtt', dest='max_rtt', type=float, default=None,
+        metavar='T', help='Only consider RPCs with RTTs <= T usecs. Used by '
+        'rpc analyzer to select which specific RTTs to print out.')
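+# Example invocation (hypothetical file and directory names, assuming this
+# script is invoked as tthoma.py):
+#   tthoma.py --analyzers 'txintervals txpkts' --data /tmp/ttdata \
+#           --gbps 100 --interval 50 node1.tt node2.tt node3.tt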
+parser.add_option('--min', dest='min', type=float, default=None,
+        metavar='T', help='Lower bound to consider for some parameter; '
+        'specific meaning depends on analyzer')
+parser.add_option('--msglen', dest='msglen', default=None,
+        metavar='L', help='Used by some analyzers to filter packets based on '
+        'message length; it can contain either a single integer value '
+        '(largest allowable length) or two values (min and max, inclusive).')
+parser.add_option('--negative-ok', action='store_true', default=False,
+        dest='negative_ok',
+        help='Don\'t print warnings when negative delays are encountered')
+parser.add_option('--node', dest='node', default=None,
+        metavar='N', help='Specifies a particular node (the name of its '
+        'trace file without the extension); required by some analyzers')
+parser.add_option('--pkt', dest='pkt', default=None,
+        metavar='ID:OFF', help='Identifies a specific packet with ID:OFF, '
+        'where ID is the RPC id on the sender (even means request message, '
+        'odd means response) and OFF is an offset in the message; if this '
+        'option is specified, some analyzers will output information specific '
+        'to that packet.')
+parser.add_option('--plot', '-p', dest='plot', default=None,
+        metavar='DIR', help='Some analyzers can generate data plots, but '
+        'they will do so only if this option is specified; DIR gives the '
+        'directory in which to place plots.')
+parser.add_option('--pkt-types', dest='pkt_types', default='data',
+        metavar='T', help='Used by some analyzers to determine which types of '
+        'packets to include for analysis; a list of the values \'data\' for '
+        'Homa data packets, \'tcp\' for TCP packets, and \'grant\' for Homa '
+        'grants, or \'all\' to select all types (default: \'data\')')
+parser.add_option('--rpc-start', dest='rpc_start', default=None,
+        metavar='T', help='Used by some analyzers to filter RPCs based on '
+        'starting time; contains two values (min and max, inclusive).')
+parser.add_option('--rtt', dest='rtt', default=None,
+        metavar='T', help='Used by some analyzers to filter RPCs based on '
+        'end-to-end round trip time; contains two values (min and max, '
+        'inclusive).')
+parser.add_option('--rx-core', dest='rx_core', type=int, default=None,
+        metavar='C', help='If specified, some analyzers will ignore packets '
+        'received on cores other than C')
+parser.add_option('--rx-node', dest='rx_node', default=None,
+        metavar='N', help='If specified, some analyzers will ignore packets '
+        'received by nodes other than N')
+parser.add_option('--same-gro-core', dest='same_gro_core', action="store_true",
+        default=False, help='If specified, the pass analyzer will only '
+        'consider passing for packets that are processed by GRO on the '
+        'same core')
+parser.add_option('--segs', action='store_true', default=False, dest='segs',
+        help='By default some analyzers will consider only the first segment '
+        'of packets that are segmented by TSO; if this option '
+        'is specified then they will consider all of the derived segments')
+parser.add_option('--sort', dest='sort', default=None,
+        metavar='S', help='Used by some analyzers to select a field to use '
+        'for sorting packets; legal values and the default depend on the '
+        'analyzer')
+parser.add_option('--threshold', dest='threshold', type=int, default=50,
+        metavar='T', help='Used by some analyzers as a threshold time value, '
+        'in microseconds (default: 50)')
+parser.add_option('--time', dest='time', type=float, default=None,
+        metavar='T', help='Time of interest; required by some analyzers')
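+# Example: '--pkt 123456:64000' names the data packet of sender-side RPC
+# 123456 that carries message offset 64000 (the id:offset syntax is checked
+# against the regular expression below).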
+parser.add_option('--tx-core', dest='tx_core', type=int, default=None,
+        metavar='C', help='If specified, some analyzers will ignore packets '
+        'transmitted from cores other than C')
+parser.add_option('--tx-qid', dest='tx_qid', type=int, default=None,
+        metavar='C', help='Specifies a transmit queue identifier; used '
+        'by some analyzers to select a specific queue.')
+parser.add_option('--tx-node', dest='tx_node', default=None,
+        metavar='N', help='If specified, some analyzers will ignore packets '
+        'transmitted by nodes other than N')
+parser.add_option('--verbose', '-v', action='store_true', default=False,
+        dest='verbose',
+        help='Print additional output with more details')
+
+(options, tt_files) = parser.parse_args()
+if options.help:
+    parser.print_help()
+    print("\nAvailable analyzers:")
+    print_analyzer_help()
+    exit(0)
+if not tt_files:
+    print('No trace files specified')
+    exit(1)
+if options.data:
+    os.makedirs(options.data, exist_ok=True)
+if options.plot:
+    os.makedirs(options.plot, exist_ok=True)
+if options.pkt:
+    match = re.match('([0-9]+):([0-9]+)$', options.pkt)
+    if not match:
+        print('Bad value "%s" for --pkt option; must be id:offset'
+                % (options.pkt), file=sys.stderr)
+        exit(1)
+    options.pkt_id = int(match.group(1))
+    options.pkt_offset = int(match.group(2))
+dispatcher = Dispatcher()
+analyzer_classes = []
+for name in options.analyzers.split():
+    class_name = 'Analyze' + name[0].capitalize() + name[1:]
+    if not hasattr(sys.modules[__name__], class_name):
+        print('No analyzer named "%s"' % (name), file=sys.stderr)
+        exit(1)
+    dispatcher.interest(class_name)
+    analyzer_classes.append(class_name)
+
+# Parse the timetrace files; this will invoke handlers in the analyzers.
+for file in tt_files:
+    dispatcher.parse(file)
+
+dispatcher.print_no_matches()
+
+if options.verbose:
+    dispatcher.print_stats()
+
+# Invoke 'analyze' methods in each analyzer, if present, to perform
+# postprocessing now that all the trace data has been read.
+rpcs_invoked = False
+for analyzer in dispatcher.get_analyzers():
+    # Special hack: AnalyzeRpcs and AnalyzePackets are mutually dependent,
+    # but we need to make sure that AnalyzeRpcs is always invoked first.
+    if analyzer.__class__.__name__ == 'AnalyzePackets' and not rpcs_invoked:
+        rpc_analyzer = dispatcher.get_analyzer('AnalyzeRpcs')
+        if rpc_analyzer != None:
+            rpc_analyzer.analyze()
+        rpcs_invoked = True
+    elif analyzer.__class__.__name__ == 'AnalyzeRpcs':
+        if rpcs_invoked:
+            continue
+        rpcs_invoked = True
+    if hasattr(analyzer, 'analyze'):
+        # print('Calling %s.analyze' % (type(analyzer).__name__), file=sys.stderr)
+        analyzer.analyze()
+
+# Give each analyzer a chance to output its findings (includes
+# printing output and generating data files).
+for name in analyzer_classes:
+    analyzer = dispatcher.get_analyzer(name)
+    if hasattr(analyzer, 'output'):
+        analyzer.output()
\ No newline at end of file
diff --git a/util/ttmerge.py b/util/ttmerge.py
index f52b7242..8e233aff 100755
--- a/util/ttmerge.py
+++ b/util/ttmerge.py
@@ -1,18 +1,7 @@
 #!/usr/bin/python3
-# Copyright (c) 2019-2022 Stanford University
-#
-# Permission to use, copy, modify, and distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+# Copyright (c) 2019-2022 Homa Developers
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
 
 """
 Merge two or more timetraces into a single trace. All of the traces
diff --git a/util/ttmlxalloc.py b/util/ttmlxalloc.py
deleted file mode 100755
index 0a4d2005..00000000
--- a/util/ttmlxalloc.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/python3
-
-# Copyright (c) 2022 Stanford University
-#
-# Permission to use, copy, modify, and distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-"""
-Scans a time trace file for entries generated by the Mellanox driver about
-packet allocations and deallocations (because the per-channel cache
-overflowed or underflowed). If the file argument is present, it specifies
-the name of the time trace file; otherwise time traces read from standard
-input. If --verbose is specified, then the program outputs a new time trace
-where adjacent allocate/free entries have been collapsed into a single entry
-for ease of reading. Otherwise it just prints statistics about the cost
-of allocation and deallocation
-Usage: ttmlxalloc.py [--verbose] [file]
-"""
-
-from __future__ import division, print_function
-from glob import glob
-from optparse import OptionParser
-import math
-import os
-import re
-import string
-import sys
-
-verbose = False
-f = sys.stdin
-
-# A dictionary where keys are core ids, and each value is the number
-# of consecutive time trace entries for that core that are for page
-# allocations/frees.
-num_allocs = {}
-num_frees = {}
-
-# A dictionary where keys are core ids, and each value is the time of
-# the first page allocation/free entry in the current batch for that core.
-first_alloc_time = {}
-first_free_time = {}
-
-# A dictionary where keys are core ids, and each value is the time of
-# the most recent page allocation/free for that core.
-last_alloc_time = {}
-last_free_time = {}
-
-# Time of previous time trace record that was printed.
-prev_time = 0
-
-# Each entry in this list is a count of the number of pages allocated/freed
-# in one batch.
-alloc_counts = []
-free_counts = []
-
-# Each entry in this list is the time consumed by a single batch of page
-# allocations/frees.
-alloc_times = []
-free_times = []
-
-# Dictionary whose keys are the ids of all the distinct RPCs seen in
-# the trace.
-ids = {} - -if (len(sys.argv) == 2) and (sys.argv[1] == "--help"): - print("Usage: %s [--stats] [file]" % (sys.argv[0])) - sys.exit(0) -if (len(sys.argv) >= 2) and (sys.argv[1] == "--verbose"): - verbose = True - sys.argv.pop(1) -if len(sys.argv) >= 2: - f = open(sys.argv[1]) - -for line in f: - match = re.match(' *([0-9.]+) us .* \[(C[0-9]+)\] (.*)', line) - if not match: - if verbose: - print(line) - continue - time = float(match.group(1)) - core = match.group(2) - msg = match.group(3) - if not core in num_allocs: - num_allocs[core] = 0 - num_frees[core] = 0 - match = re.match('.*id ([0-9.]+)', msg) - if match: - ids[match.group(1)] = 1 - - if 'mlx starting page alloc' in msg: - if num_allocs[core] == 0: - first_alloc_time[core] = time - last_alloc_time[core] = time - num_allocs[core] += 1 - continue - - if 'mlx starting page release' in msg: - if num_frees[core] == 0: - first_free_time[core] = time - last_free_time[core] = time - num_frees[core] += 1 - continue - - if num_allocs[core] != 0: - if verbose: - print("%9.3f us (+%8.3f us) [%s] mlx allocated %d pages (%.1f us)" % ( - time, time - prev_time, core, num_allocs[core], - last_alloc_time[core] - first_alloc_time[core])) - alloc_counts.append(num_allocs[core]) - alloc_times.append(last_alloc_time[core] - first_alloc_time[core]) - num_allocs[core] = 0 - prev_time = time - - if num_frees[core] != 0: - if verbose: - print("%9.3f us (+%8.3f us) [%s] mlx freed %d pages (%.1f us)" % ( - time, time - prev_time, core, num_frees[core], - last_free_time[core] - first_free_time[core])) - free_counts.append(num_frees[core]) - free_times.append(last_free_time[core] - first_free_time[core]) - num_frees[core] = 0 - prev_time = time - - if verbose: - print("%9.3f us (+%8.3f us) [%s] %s" % (time, time - prev_time, core, msg)) - prev_time = time - -if verbose: - sys.exit(0) - -print("Total number of RPCs: %6d" % (len(ids))) -print("Total elapsed time: %8.1f us" % (prev_time)) -print("") -if len(alloc_counts) == 0: - print("No page allocations") -else: - print("Page allocations:") - print(" Total pages: %6d" % (sum(alloc_counts))) - print(" Batches: %5d" % (len(alloc_counts))) - print(" Average batch size: %7.1f" % (sum(alloc_counts)/len(alloc_counts))) - print(" Average batch time: %7.1f us" % (sum(alloc_times)/len(alloc_counts))) - print(" Alloc time per RPC: %7.1f us" % (sum(alloc_times)/len(ids))) - print(" Total time: %7.1f us (%.3f core)" % (sum(alloc_times), - sum(alloc_times)/prev_time)) -if len(free_counts) == 0: - print("No page frees") -else: - print("Page frees:") - print(" Total pages: %6d" % (sum(free_counts))) - print(" Batches: %5d" % (len(free_counts))) - print(" Average batch size: %7.1f" % (sum(free_counts)/len(free_counts))) - print(" Average batch time: %7.1f us" % (sum(free_times)/len(free_counts))) - print(" Free time per RPC: %7.1f us" % (sum(free_times)/len(ids))) - print(" Total time: %7.1f us (%.3f core)" % (sum(free_times), - sum(free_times)/prev_time)) \ No newline at end of file diff --git a/util/ttnicdelay.py b/util/ttnicdelay.py deleted file mode 100755 index 0ba6b1bd..00000000 --- a/util/ttnicdelay.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/python3 - -""" -This program looks for evidence suggesting that NICs are configured to -delay interrupts. It scans two timetraces for the same time interval, one -from a client and one from a server, looking for situations where the -server experiences a significant gap between two consecutive clients -even though the client transmitted them back-to-back. 
- -Usage: ttgap.py [--verbose] [client [server]] - -The "client" and "server" arguments give the names of the two timetrace -files; they default to client.tt and server.tt. One way to collect these -traces is by running "cp_node client --one-way --workload 500000" on the -client. -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys -from statistics import median - -client_trace = "client.tt" -server_trace = "server.tt" -verbose = False -if (len(sys.argv) >= 2) and (sys.argv[1] == "--help"): - print("Usage: %s [--verbose] [client_trace [server_trace]]" % (sys.argv[0])) - sys.exit(0) -if (len(sys.argv) >= 2) and (sys.argv[1] == "--verbose"): - verbose = True - sys.argv.pop(1) -if len(sys.argv) >= 2: - client_trace = sys.argv[1] - sys.argv.pop(1) -if len(sys.argv) >= 2: - server_trace = sys.argv[1] - -# Information about each data packet sent by the client: the key has the -# form "id:offset", identifying a particular data packet. The value is -# a list of where time is the time when the packet was sent -# and gap is the elapsed time since the previous packet was sent. -client_packets = {} - -last_xmit = 0.0 -total_xmit_gap = 0.0 - -for line in open(client_trace): - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Finished queueing packet: rpc id ([0-9]+), offset ([0-9]+)', line) - if match: - time = float(match.group(1)) - id = match.group(4) - offset = match.group(5) - key = id + ":" + offset - gap = time-last_xmit - if 0: - print("%9.3f: xmit %s, gap %.1f" % (time, key, gap)) - if (offset != "0") and (gap > 10.0): - total_xmit_gap += gap - if last_xmit > 0: - client_packets[id + ":" + offset] = [time, gap] - last_xmit = time - -last_recv = 0.0 -total_gap = 0.0 -num_gaps = 0 -num_pkts = 0 -last_gap_pkt = 0 -gap_offsets = [] - -for line in open(server_trace): - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'homa_gro_receive got packet .* id ([0-9]+), offset ([0-9]+)', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = match.group(5) - key = "%d:%s" % (id-1, offset) - gap = time - last_recv - last_recv = time - if (offset != "0") and (key in client_packets): - num_pkts += 1 - client_time, client_gap = client_packets[key] - if (gap > 20) and (client_gap < 5): - if verbose: - print("%9.3f: recv %s, gap %.1f, xmit_gap %.1f " - "(sent at %.3f), pkts since last gap %d" % ( - time, key, gap, client_gap, client_time, - num_pkts - last_gap_pkt)) - num_gaps += 1 - total_gap += gap - client_gap - last_gap_pkt = num_pkts - gap_offsets.append(int(offset)) - -print("%d unexpected gaps over %d packets" % (num_gaps, num_pkts)) -print("Total recv gap %.1f us (%.1f%% of elapsed time)" % (total_gap, - 100.0*total_gap/last_xmit)) -print("Average interval between gaps: %.1f packets" % (num_pkts/num_gaps)) -print("Average gap length: %.1f us" % (total_gap/num_gaps)) - -if verbose: - print("Total xmit gap %.1fus (%.1f%% of elapsed time)" % (total_xmit_gap, - 100.0*total_xmit_gap/last_xmit)) - -if 0: - gap_offsets = sorted(gap_offsets) - cur_offset = -1 - count = 0 - for offset in gap_offsets: - if offset != cur_offset: - if cur_offset >= 0: - print("%6d %d" % (cur_offset, count)) - cur_offset = offset - count = 0 - count += 1 - print("%6d %d" % (cur_offset, count)) \ No newline at end of file diff --git a/util/ttoffset.py b/util/ttoffset.py index 69627bd2..017140b4 100755 --- a/util/ttoffset.py +++ 
b/util/ttoffset.py @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2019-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2019-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Rewrite a time trace with all of the times offset by a fixed amount @@ -44,9 +33,9 @@ delta = float(sys.argv[2]) - float(sys.argv[1]) for line in f: - match = re.match(' *([0-9.]+) us (.*)', line) + match = re.match(' *([-0-9.]+) us (.*)', line) if not match: - print(line) + print(line, end='') continue time = float(match.group(1)) print("%9.3f us %s" % (time + delta, match.group(2))) \ No newline at end of file diff --git a/util/ttpktdelay.py b/util/ttpktdelay.py deleted file mode 100755 index 64f485f5..00000000 --- a/util/ttpktdelay.py +++ /dev/null @@ -1,679 +0,0 @@ -#!/usr/bin/python3 - -""" -Scans two timetraces for the same time interval, one from a client and one -from a server, to analyze packet delays in both directions. - -Usage: ttpktdelay.py [--verbose] [client [server]] - -The "client" and "server" arguments give the names of the two timetrace -files; they default to client.tt and server.tt. -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys -from statistics import median - -client_trace = "client.tt" -server_trace = "server.tt" -verbose = False -if (len(sys.argv) >= 2) and (sys.argv[1] == "--help"): - print("Usage: %s [--verbose] [client_trace [server_trace]]" % (sys.argv[0])) - sys.exit(0) -if (len(sys.argv) >= 2) and (sys.argv[1] == "--verbose"): - verbose = True - sys.argv.pop(1) -if len(sys.argv) >= 2: - client_trace = sys.argv[1] - sys.argv.pop(1) -if len(sys.argv) >= 2: - server_trace = sys.argv[1] - -def percentile(list, pct, format): - """ - Finds the element of list corresponding to a given percentile pct - (0 is first, 100 or more is last), formats it according to format, - and returns the result. Returns "N/A" if the list is empty. - """ - if len(list) == 0: - return "N/A" - i = int(pct*len(list)/100) - if i >= len(list): - i = len(list) - 1 - return format % (list[i]) - -def percentile2(list, pct, format): - """ - Finds the element of list corresponding to a given percentile pct - (0 is first, 100 or more is last), treats the element a list, - formats the first element of that list according to format, - and returns the result. Returns "N/A" if the list is empty. - """ - if len(list) == 0: - return "N/A" - i = int(pct*len(list)/100) - if i >= len(list): - i = len(list) - 1 - return format % (list[i][0]) - -def dict_diffs(dict1, dict2, msg=None): - """ - Return a list consisting of the differences between elements in - dict2 and those in dict1 with matching keys (ignore elements that - appear in only one dict). 
If msg is specified, then negative - differences should be ignored and an error message should be printed; - msg provides info about the dictionaries being diffed. - """ - diffs = [] - for key in dict1: - if key in dict2: - if msg and dict2[key] < dict1[key]: - print("Skipping out of order diff for %s, id %s: %9.3f " - "< %9.3f" % (msg, key, dict2[key], dict1[key])) - else: - diffs.append(dict2[key] - dict1[key]) - return diffs - -def print_samples(event1, event2, offset, delays, pct, msg, num_samples): - """ - Print identifying information about events that fall at or near a given - percentile (from smallest to largest) among a collection of delays - event1: information about first event (dictionary mapping pktid -> time) - event2: information about a later event - offset: clock offset between times in event1 and those in event2 - delays: sorted list of delays computed from event1 to event2 - pct: desired percentile - msg: human-readable text describing the interval - num_samples: number of events to print - """ - - if len(delays) == 0: - print("No delays available for %s" % (msg)) - return - target = delays[pct*len(delays)//100] - samples = [] - for pktid in event1: - if not pktid in event2: - continue - elapsed = event2[pktid] - event1[pktid] - offset - samples.append({'time': event2[pktid], 'pktid': pktid, - 'delay': elapsed}) - - # Sort samples by how close their delay is to the desired one. - samples = sorted(samples, key=lambda sample : abs(target - sample['delay'])) - - # Now select the best samples without duplicating times - chosen = [] - for sample in samples: - for choice in chosen: - if abs(choice['time'] - sample['time']) < 100: - sample = None - break - if sample != None: - chosen.append(sample) - if len(chosen) == num_samples: - break - - if len(chosen) == 0: - print("Couldn't find %dth percentile events for %s" % (pct, msg)) - return - print("%3dth percentile events for %s:" % (pct, msg)) - for sample in chosen: - print(" %9.3f %-22s %.1f us" % (sample['time'], - "(pktid %s):" % (sample['pktid']), sample['delay'])) - -def print_samples2(events, pct, msg, fmt, num_samples): - """ - Similar to print_sample, except that the data is passed in a single - list. Prints info about the event that in events that falls at a given - percentile (from smallest to largest) - events: list of tuples, where value is the data on - which we're computing percentile, and time is the time - in the trace when the tuple was logged. 
The list is sorted - in order of the values - pct: desired percentile - msg: human-readable text describing the values - fmt: printf-style format string for printing a value - num_samples: number of events to print - """ - - if len(events) == 0: - print("No events available for %s" % (msg)) - return - target = events[pct*len(events)//100][0] - - # Sort sample by how close their value is to the target value - resorted = sorted(events, key = lambda event: abs(target - event[0])) - - # Now select the best samples without duplicating times - chosen = [] - for sample in resorted: - for choice in chosen: - if abs(choice[1] - sample[1]) < 100: - sample = None - break - if sample != None: - chosen.append(sample) - if len(chosen) == num_samples: - break - - # Print out the chosen samples - if len(chosen) == 0: - print("Couldn't find %dth percentile events for %s" % (pct, msg)) - return - print("%3dth percentile events for %s:" % (pct, msg)) - for sample in chosen: - print(" %9.3f %s" % (sample[1], fmt % (sample[0]))) - -def parse_tt(tt, server): - """ - Reads the timetrace file given by tt and returns a dictionary containing - extracted statistics (see below) The server argument indicates whether - this is a server trace; if so, 1 gets subtracted from all RPC ids to - produce client ids. - - The return value from parse_tt is a dictionary with the following elements: - - Each of the following elements is itself a dictionary, where keys are - packet ids (rpc_id:offset) and values are times when that packet id - reached the given point of processing. rpc_id's are client-side ids, - even for server info. - data_send: time when ip_queue_xmit was called for a data packet - data_mlx: time when mlx driver finished sending a data packet - data_gro: time when this packet was processed by homa_gro_receive - data_gro_last: time when last packet in batch containing this packet - was processed by homa_gro_receive - data_handoff: time when SoftIRQ handoff was issued for batch - containing this packet - data_softirq_start: time when homa_softirq was invoked with batch that - includes this packet - data_softirq: time when this homa_data_pkt processed this packet - at SoftIRQ level - - Each of the following elements is a list of 2-element lists, of which - the second element is the time at which the last relevant event occurred - delays_before_napi: elements are lists, where usecs is the - delay between common_interrupt and mlx5e_napi_poll - gro_times: elements are lists, where usecs is the - the delay between the invocation of mlx5e_napi_poll - and the last call to homa_gro_receive for a batch of - packets - gro_counts: elements are lists, where count is the - number of packets processed by homa_gro_receive in a - batch - gro_gaps: elements are lists, where usecs is the - time between the last call to homa_gro_receive and - when the handoff to SoftIRQ was made - """ - - global verbose - - data_send = {} - data_mlx = {} - data_gro = {} - data_gro_last = {} - data_handoff = {} - data_softirq_start = {} - data_softirq = {} - - grant_send = {} - grant_mlx = {} - grant_gro = {} - grant_gro_last = {} - grant_handoff = {} - grant_softirq_start = {} - grant_softirq = {} - - delays_before_napi = [] - gro_times = [] - gro_counts = [] - gro_gaps = [] - - # Keys are RPC ids and core; each value is the most recent pktid for which - # ip_queue_xmit was invoked for this RPC on this core. 
- grant_ids = {} - - # Keys are cores; each value is a list of packet ids that need - # handoff events for this core - data_handoff_ids = {} - grant_handoff_ids = {} - - # Keys are cores; each value is the most recent time when homa_softirq - # was invoked on the core - softirq_start = {} - - # Keys are cores; each value is the most recent time when Homa GRO - # processed a packet on that core. - last_gro = {} - - # Keys are cores; each value is the number of packets processed by - # homa_gro_receive in the current batch. - num_gro_packets = {} - - # Keys are cores; each value is the most recent time when mlx_5e_napi_poll - # was invoked on that core. - last_mlx_napi = {} - - # Keys are cores; each value is the most recent time when the lowest - # level interrupt handler (common_interrupt) was invoked. - last_irq = {} - - # Counts of number of records of each type; used to detect when - # changes in the timetrace code break these statistics. - counts = { - "sent packet": 0, - "napi_poll invoked": 0, - "backlog enqueue": 0, - "softirq start": 0, - "interrupt start": 0, - "ip_queue_xmit": 0, - "gro_receive": 0, - "softirq data pkt": 0, - "sent grant": 0, - "gro_receive got grant": 0, - "grant processed": 0, - } - - for line in open(tt): - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\]' - '.* id ([-0-9.]+),.* offset ([-0-9.]+)', line) - if not match: - # mlx finished sending grant - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\] mlx ' - 'sent homa packet to .* id ([-0-9.]+), type 21', line) - if match: - counts["sent packet"] += 1 - time = float(match.group(1)) - core = int(match.group(2)) - id = match.group(3) - if (server): - id = str(int(id) - 1) - key = id + ":" + str(core) - if key in grant_ids: - grant_mlx[grant_ids[key]] = time - - # NAPI handler on receiver - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\] ' - 'mlx5e_napi_poll invoked', line) - if match: - counts["napi_poll invoked"] += 1 - time = float(match.group(1)) - core = int(match.group(2)) - last_mlx_napi[core] = time - if core in last_irq: - delays_before_napi.append([time - last_irq[core], time]) - - # Batch of packets has been handed off to SoftIRQ - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\] ' - 'enqueue_to_backlog', line) - if match: - counts["backlog enqueue"] += 1 - time = float(match.group(1)) - core = int(match.group(2)) - if core in last_gro: - gro_gaps.append([time - last_gro[core], time]) - if core in last_mlx_napi: - gro_times.append([last_gro[core] - last_mlx_napi[core], - last_mlx_napi[core]]) - if core in num_gro_packets: - gro_counts.append([num_gro_packets[core], time]) - num_gro_packets[core] = 0 - if core in data_handoff_ids: - for pktid in data_handoff_ids[core]: - data_gro_last[pktid] = last_gro[core] - data_handoff[pktid] = time - if core in grant_handoff_ids: - for pktid in grant_handoff_ids[core]: - grant_gro_last[pktid] = last_gro[core] - grant_handoff[pktid] = time - data_handoff_ids[core] = [] - grant_handoff_ids[core] = [] - - # homa_softirq invocation time - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\] ' - 'homa_softirq: first packet', line) - if match: - counts["softirq start"] += 1 - time = float(match.group(1)) - core = int(match.group(2)) - softirq_start[core] = time - - # common_interrupt invocation time - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\] ' - 'irq common_interrupt starting', line) - if match: - counts["interrupt start"] += 1 - time = float(match.group(1)) - core = int(match.group(2)) - last_irq[core] = time - - continue - - time = 
float(match.group(1)) - core = int(match.group(3)) - id = match.group(4) - if (server): - id = str(int(id) - 1) - offset = match.group(5) - pktid = id + ":" + offset - - # Outgoing data sent - if re.match('.*calling .*_xmit: wire_bytes', line): - counts["ip_queue_xmit"] += 1 - data_send[pktid] = time - - # Data packet passed to NIC - if "Finished queueing packet" in line: - counts["ip_queue_xmit"] += 1 - data_mlx[pktid] = time - - # Incoming data packet processed by Homa GRO - if "homa_gro_receive got packet" in line: - counts["gro_receive"] += 1 - data_gro[pktid] = time - last_gro[core] = time - if not core in num_gro_packets: - num_gro_packets[core] = 0 - num_gro_packets[core] += 1 - if not core in data_handoff_ids: - data_handoff_ids[core] = [] - data_handoff_ids[core].append(pktid) - - # Incoming data (SoftIRQ level) - if "incoming data packet, id" in line: - counts["softirq data pkt"] += 1 - if core in softirq_start: - data_softirq[pktid] = time - data_softirq_start[pktid] = softirq_start[core] - - # Outgoing grant - if "sending grant for id" in line: - counts["sent grant"] += 1 - grant_send[pktid] = time - key = id + ":" + str(core) - grant_ids[key] = pktid - - # Incoming grant processed by Homa GRO - if "homa_gro_receive got grant" in line: - counts["gro_receive got grant"] += 1 - grant_gro[pktid] = time - last_gro[core] = time - if not core in grant_handoff_ids: - grant_handoff_ids[core] = [] - grant_handoff_ids[core].append(pktid) - - # Incoming grant (SoftIRQ level) - if "processing grant for id" in line: - counts["grant processed"] += 1 - if core in softirq_start: - grant_softirq[pktid] = time - grant_softirq_start[pktid] = softirq_start[core] - - if verbose: - if server: - print("Record counts in server log:") - else: - print("Record counts in client log:") - for id in counts: - print(" %-24s %6d" % (id, counts[id])) - - return { - 'data_send': data_send, - 'data_mlx': data_mlx, - 'data_gro': data_gro, - 'data_gro_last': data_gro_last, - 'data_handoff': data_handoff, - 'data_softirq_start': data_softirq_start, - 'data_softirq': data_softirq, - - 'grant_send': grant_send, - 'grant_mlx': grant_mlx, - 'grant_gro': grant_gro, - 'grant_gro_last': grant_gro_last, - 'grant_handoff': grant_handoff, - 'grant_softirq_start': grant_softirq_start, - 'grant_softirq': grant_softirq, - - 'delays_before_napi': delays_before_napi, - 'gro_times': gro_times, - 'gro_counts': gro_counts, - 'gro_gaps': gro_gaps, - } - -client = parse_tt(client_trace, False) -server = parse_tt(server_trace, True) - -# Now combine the data from the two time traces to compute interesting delays - -# Delays for data packets and grants passing through the IP stack -# on a single machine. -client_data_xmit = sorted(dict_diffs(client['data_send'], client['data_mlx'], - "client data_send -> data_mlx")) -client_grant_xmit = sorted(dict_diffs(client['grant_send'], client['grant_mlx'], - "client grant_send -> grant_mlx")) -server_data_xmit = sorted(dict_diffs(server['data_send'], server['data_mlx'], - "server data_send -> data_mlx")) -server_grant_xmit = sorted(dict_diffs(server['grant_send'], server['grant_mlx'], - "server grant_send -> grant_mlx")) - -# Delays for data packets and grants from NIC on one machine to start of -# NAPI-level process on the other. These differences have not been compensated -# for clock differences between the machines. 
-cs_data_net = sorted(dict_diffs(client['data_mlx'], server['data_gro'])) -cs_grant_net = sorted(dict_diffs(client['grant_mlx'], server['grant_gro'])) -sc_data_net = sorted(dict_diffs(server['data_send'], client['data_gro'])) -sc_grant_net = sorted(dict_diffs(server['grant_send'], client['grant_gro'])) - -# Additional GRO processing after this packet (other packets in batch) -client_data_gro_last = sorted(dict_diffs(client['data_gro'], - client['data_gro_last'], "client data_gro -> data_gro_last")) -client_grant_gro_last = sorted(dict_diffs(client['grant_gro'], - client['grant_gro_last'], "client grant_gro -> grant_gro_last")) -server_data_gro_last = sorted(dict_diffs(server['data_gro'], - server['data_gro_last'], "server data_gro -> data_gro_last")) -server_grant_gro_last = sorted(dict_diffs(server['grant_gro'], - server['grant_gro_last'], "server grant_gro -> grant_gro_last")) - -# Delays from last GRO packet to SoftIRQ handoff -client_data_handoff = sorted(dict_diffs(client['data_gro_last'], - client['data_handoff'], "client data_gro_last -> data_handoff")) -client_grant_handoff = sorted(dict_diffs(client['grant_gro_last'], - client['grant_handoff'], "client grant_gro_last -> grant_handoff")) -server_data_handoff = sorted(dict_diffs(server['data_gro_last'], - server['data_handoff'], "server data_gro_last -> data_handoff")) -server_grant_handoff = sorted(dict_diffs(server['grant_gro_last'], - server['grant_handoff'], "server grant_gro_last -> grant_handoff")) - -# Delays from SoftIRQ handoff until homa_softirq starts -client_data_softirq_start = sorted(dict_diffs(client['data_handoff'], - client['data_softirq_start'], "client data_handoff -> softirq_start")) -client_grant_softirq_start = sorted(dict_diffs(client['grant_handoff'], - client['grant_softirq_start'], "client grant_handoff -> softirq_start")) -server_data_softirq_start = sorted(dict_diffs(server['data_handoff'], - server['data_softirq_start'], "server data_handoff -> softirq_start")) -server_grant_softirq_start = sorted(dict_diffs(server['grant_handoff'], - server['grant_softirq_start'], "server grant_handoff -> softirq_start")) - -# Delays from SoftIRQ start until the desired packet is processed -client_data_softirq = sorted(dict_diffs(client['data_softirq_start'], - client['data_softirq'], "client data_softirq_start -> data_softirq")) -client_grant_softirq = sorted(dict_diffs(client['grant_softirq_start'], - client['grant_softirq'], "client grant_softirq_start -> grant_softirq")) -server_data_softirq = sorted(dict_diffs(server['data_softirq_start'], - server['data_softirq'], "server data_softirq_start -> data_softirq")) -server_grant_softirq = sorted(dict_diffs(server['grant_softirq_start'], - server['grant_softirq'], "server grant_softirq_start -> grant_softirq")) - -# Total delays (ip_queue_xmit to SoftIRQ) -cs_data_total = sorted(dict_diffs(client['data_send'], server['data_softirq'])) -sc_data_total = sorted(dict_diffs(server['data_send'], client['data_softirq'])) -cs_grant_total = sorted(dict_diffs(client['grant_send'], server['grant_softirq'])) -sc_grant_total = sorted(dict_diffs(server['grant_send'], client['grant_softirq'])) - -# Compute minimum RTT and server clock offset -if len(cs_data_net) == 0: - print("No data in cs_data_net") - exit(1) -if len(sc_data_net) == 0: - print("No data in sc_data_net") - exit(1) -rtt = cs_data_net[0] + sc_data_net[0] -clock_offset = cs_data_net[0] - rtt/2 -print("Minimum Network RTT: %.1f us, clock offset %.1f us" % (rtt, clock_offset)) - -# Adjust cross-machine times to reflect 
clock offset. -for list in [cs_data_net, cs_grant_net, cs_data_total, cs_grant_total]: - for i in range(len(list)): - list[i] -= clock_offset -for list in [sc_data_net, sc_grant_net, sc_data_total, sc_grant_total]: - for i in range(len(list)): - list[i] += clock_offset - -percents = [0, 10, 30, 50, 70, 90, 99, 100] - -print("\nIP: IP stack, from calling ip_queue_xmit to NIC wakeup") -print("Net: Additional time until homa_gro_receive gets packet") -print("GRO Other: Time until end of GRO batch") -print("GRO Gap: Delay after GRO packet processing until SoftIRQ handoff") -print("Wakeup: Delay until homa_softirq starts") -print("SoftIRQ: Time in homa_softirq until packet is processed") -print("Total: End-to-end time from calling ip_queue_xmit to homa_softirq") -print(" handler for packet") - -print("\nData packet lifetime (us), client -> server:") -print("Pctile IP Net GRO Other GRO Gap Wakeup SoftIRQ Total") -for p in percents: - print("%3d %6s %6s %6s %6s %6s %6s %6s" % (p, - percentile(client_data_xmit, p, "%.1f"), - percentile(cs_data_net, p, "%.1f"), - percentile(server_data_gro_last, p, "%.1f"), - percentile(server_data_handoff, p, "%.1f"), - percentile(server_data_softirq_start, p, "%.1f"), - percentile(server_data_softirq, p, "%.1f"), - percentile(cs_data_total, p, "%.1f"))) - -print("\nData packet lifetime (us), server -> client:") -print("Pctile IP Net GRO Other GRO Gap Wakeup SoftIRQ Total") -for p in percents: - print("%3d %6s %6s %6s %6s %6s %6s %6s" % (p, - percentile(server_data_xmit, p, "%.1f"), - percentile(sc_data_net, p, "%.1f"), - percentile(client_data_gro_last, p, "%.1f"), - percentile(client_data_handoff, p, "%.1f"), - percentile(client_data_softirq_start, p, "%.1f"), - percentile(client_data_softirq, p, "%.1f"), - percentile(sc_data_total, p, "%.1f"))) - -print("\nGrant lifetime (us), client -> server:") -print("Pctile IP Net GRO Other GRO Gap Wakeup SoftIRQ Total") -for p in percents: - print("%3d %6s %6s %6s %6s %6s %6s %6s" % (p, - percentile(client_grant_xmit, p, "%.1f"), - percentile(cs_grant_net, p, "%.1f"), - percentile(server_grant_gro_last, p, "%.1f"), - percentile(server_grant_handoff, p, "%.1f"), - percentile(server_grant_softirq_start, p, "%.1f"), - percentile(server_grant_softirq, p, "%.1f"), - percentile(cs_grant_total, p, "%.1f"))) - -print("\nGrant lifetime (us), server -> client:") -print("Pctile IP Net GRO Other GRO Gap Wakeup SoftIRQ Total") -for p in percents: - print("%3d %6s %6s %6s %6s %6s %6s %6s" % (p, - percentile(server_grant_xmit, p, "%.1f"), - percentile(sc_grant_net, p, "%.1f"), - percentile(client_grant_gro_last, p, "%.1f"), - percentile(client_grant_handoff, p, "%.1f"), - percentile(client_grant_softirq_start, p, "%.1f"), - percentile(client_grant_softirq, p, "%.1f"), - percentile(sc_grant_total, p, "%.1f"))) - -print("\nAdditional client-side statistics:") -print("Pre NAPI: usecs from interrupt entry to NAPI handler") -print("GRO Total: usecs from NAPI handler entry to last homa_gro_receive") -print("Batch: number of packets processed in one interrupt") -print("Gap: usecs from last homa_gro_receive call to SoftIRQ handoff") -delays_before_napi = sorted(client['delays_before_napi'], - key=lambda tuple : tuple[0]) -gro_times = sorted(client['gro_times'], key=lambda tuple : tuple[0]) -gro_counts = sorted(client['gro_counts'], key=lambda tuple : tuple[0]) -gro_gaps = sorted(client['gro_gaps'], key=lambda tuple : tuple[0]) -print("\nPctile Pre NAPI GRO Batch Gap") -for p in percents: - print("%3d %6s %6s %5s %6s" % (p, - 
percentile2(delays_before_napi, p, "%.1f"), - percentile2(gro_times, p, "%.1f"), - percentile2(gro_counts, p, "%d"), - percentile2(gro_gaps, p, "%.1f"))) - -print("\nSame stats for server:") -delays_before_napi = sorted(server['delays_before_napi'], - key=lambda tuple : tuple[0]) -gro_times = sorted(server['gro_times'], key=lambda tuple : tuple[0]) -gro_counts = sorted(server['gro_counts'], key=lambda tuple : tuple[0]) -gro_gaps = sorted(server['gro_gaps'], key=lambda tuple : tuple[0]) -print("Pctile Pre NAPI GRO Batch Gap") -for p in percents: - print("%3d %6s %6s %5s %6s" % (p, - percentile2(delays_before_napi, p, "%.1f"), - percentile2(gro_times, p, "%.1f"), - percentile2(gro_counts, p, "%d"), - percentile2(gro_gaps, p, "%.1f"))) - -num_samples = 5 -if verbose: - print("\nPotentially interesting events:") - print_samples(client['data_mlx'], server['data_gro'], clock_offset, - cs_data_net, 0, "Net (client->server data)", num_samples) - print_samples(client['data_mlx'], server['data_gro'], clock_offset, - cs_data_net, 90, "Net (client->server data)", num_samples) - print_samples(client['data_mlx'], server['data_gro'], clock_offset, - cs_data_net, 99, "Net (client->server data)", num_samples) - print_samples(server['data_gro'], server['data_gro_last'], 0, - server_data_gro_last, 90, "GRO Other (client->server data)", - num_samples) - print_samples(server['data_gro'], server['data_gro_last'], 0, - server_data_gro_last, 99, "GRO Other (client->server data)", - num_samples) - print_samples(server['data_gro_last'], server['data_handoff'], 0, - server_data_handoff, 90, "GRO Gap (client->server data)", - num_samples) - print_samples(server['data_gro_last'], server['data_handoff'], 0, - server_data_handoff, 99, "GRO Gap (client->server data)", - num_samples) - print_samples(server['data_handoff'], server['data_softirq_start'], 0, - server_data_softirq_start, 90, "Wakeup (client->server data)", - num_samples) - print_samples(server['data_handoff'], server['data_softirq_start'], 0, - server_data_softirq_start, 99, "Wakeup (client->server data)", - num_samples) - print_samples(server['data_softirq_start'], server['data_softirq'], 0, - server_data_softirq, 90, "SoftIRQ (client->server data)", - num_samples) - print_samples(server['data_softirq_start'], server['data_softirq'], 0, - server_data_softirq, 99, "SoftIRQ (client->server data)", - num_samples) - - print() - print_samples2(delays_before_napi, 90, "delay before NAPI starts (server)", - "%.1f us", num_samples) - print_samples2(delays_before_napi, 99, "delay before NAPI starts (server)", - "%.1f us", num_samples) - print_samples2(gro_times, 90, "total time for GRO batch (server)", - "%.1f us", num_samples) - print_samples2(gro_times, 99, "total time for GRO batch (server)", - "%.1f us", num_samples) - print_samples2(gro_counts, 90, "packets in a GRO batch (server)", - "%d", num_samples) - print_samples2(gro_counts, 99, "packets in a GRO batch (server)", - "%d", num_samples) - print_samples2(gro_gaps, 90, "gap before SoftIRQ wakeup (server)", - "%.1f us", num_samples) - print_samples2(gro_gaps, 99, "gap before SoftIRQ wakeup (server)", - "%.1f us", num_samples); \ No newline at end of file diff --git a/util/ttprint.py b/util/ttprint.py index f54467d3..47d7240b 100755 --- a/util/ttprint.py +++ b/util/ttprint.py @@ -1,23 +1,12 @@ #!/usr/bin/python3 -# Copyright (c) 2019-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# 
copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2019-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ This program reads timetrace information from /proc/timetrace (or from the first argument, if given) and prints it out in a different form, -with times in nanoseconds instead of clock cycles. +with times in microseconds instead of clock cycles. """ from __future__ import division, print_function diff --git a/util/ttrange.py b/util/ttrange.py index 9cae919e..139f0ea5 100755 --- a/util/ttrange.py +++ b/util/ttrange.py @@ -1,18 +1,7 @@ #!/usr/bin/python3 -# Copyright (c) 2019-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2019-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Extract entries from a timetrace that fall within a particular time range. diff --git a/util/ttrpcs.py b/util/ttrpcs.py deleted file mode 100755 index ab205647..00000000 --- a/util/ttrpcs.py +++ /dev/null @@ -1,375 +0,0 @@ -#!/usr/bin/python3 - -""" -Scans a client or server timetrace to compute the time it takes for each -phase of RPCs. -Usage: ttrpcs.py [--server] [tt_file] - -The existing timetrace is in tt_file (or stdin if tt_file is omitted). -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys -from statistics import median - -# Lists of patterns (one for the client side and one for the server side). -# We'll record times for each RPC id when it hits each pattern. The list -# should be in order of occurrence within an RPC.
Fields that may exist -# in each pattern: -# pattern: the regex pattern to match against each timetrace record -# name: human-readable string to use in printout -# record_last: if this field exists and if there are multiple records -# matching the pattern, the time of the last will be recorded; -# otherwise only the first will be recorded -# first_out: this is the first data packet sent for the RPC -# out_packet: data packet sent with offset != 0 -# first_in: this is the first data packet received for the RPC -# in_packet: data packet received with offset != 0 -client_patterns = [ - {"pattern": "homa_sendmsg request.* id ([0-9]+)", - "name": "start"}, - {"pattern":"Finished queueing packet.* id ([0-9]+), offset 0", - "name": "first request packet sent", - "first_out": True}, - {"pattern":"processing grant .* id ([0-9]+)", - "name": "softirq gets first grant"}, - {"pattern":"Finished queueing packet.* id ([0-9]+), offset ([0-9]+)", - "name": "last request packet sent", - "record_last": True, - "out_packet": True}, - {"pattern":"homa_gro_receive got packet .* id ([0-9]+), offset 0", - "name": "gro gets first response packet", - "first_in": True}, - {"pattern":"sending grant for id ([0-9]+)", - "name": "sent grant"}, - {"pattern":"homa_gro_receive got packet .* id ([0-9]+), offset ([0-9]+)", - "name": "gro gets last response packet", - "record_last": True, - "in_packet": True}, - {"pattern":"homa_recvmsg returning id ([0-9]+), length ([0-9]+)", - "name": "homa_recvmsg returning"}, -] - -server_patterns = [ - {"pattern":"homa_gro_receive got packet .* id ([0-9]+), offset 0", - "name": "gro gets first request packet", - "first_in": True}, - {"pattern":"sending grant for id ([0-9]+)", - "name": "sent grant"}, - {"pattern":"homa_gro_receive got packet .* id ([0-9]+), offset ([0-9]+)", - "name": "gro gets last request packet", - "record_last": True, - "in_packet": True}, - {"pattern":"homa_recvmsg returning id ([0-9]+), length ([0-9]+)", - "name": "homa_recvmsg returning"}, - {"pattern":"homa_sendmsg response,* id ([0-9]+)", - "name": "homa_sendmsg response"}, - {"pattern":"Finished queueing packet.* id ([0-9]+), offset 0", - "name": "first response packet sent", - "first_out": True}, - {"pattern":"processing grant .* id ([0-9]+)", - "name": "softirq gets first grant"}, - {"pattern":"*Finished queueing packet.* id ([0-9]+), offset ([0-9]+)", - "name": "last response packet sent", - "record_last": True, - "out_packet": True}, -] - -# Additional patterns to track packet copying separately. 
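# A small illustration of the "record_last" convention used by the pattern
# tables above (the times here are made up): without record_last the first
# matching record wins, so repeated events keep their earliest time; with
# it, each new match overwrites the saved time, so repeated events such as
# per-packet sends end up with their latest occurrence.
def record(times, index, time, pattern):
    if (index in times) and (not "record_last" in pattern):
        return                    # keep the first occurrence
    times[index] = time           # first occurrence, or overwrite allowed

times = {}
for t in [1.0, 2.5, 4.0]:
    record(times, 3, t, {"record_last": True})
assert times[3] == 4.0            # would be 1.0 without record_last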
-aux_client_patterns = [ - {"pattern": "homa_sendmsg request.* id ([0-9]+)", - "name": "start"}, - {"pattern":"finished copy from user space for id ([0-9]+)", - "name": "finished copying req into pkts"}, - {"pattern":"starting copy to user space for id ([0-9]+)", - "name": "starting copying to user space"}, - {"pattern":"finished copying .* id ([0-9]+)", - "name": "finished copying to user space", - "record_last": True}, -] - -aux_server_patterns = [ - {"pattern":"homa_gro_receive got packet .* id ([0-9]+), offset 0", - "name": "gro gets first request packet"}, - {"pattern":"starting copy to user space for id ([0-9]+)", - "name": "starting copying to user space"}, - {"pattern":"finished copying .* id ([0-9]+)", - "name": "finished copying to user space", - "record_last": True}, - {"pattern":"finished copy from user space for id ([0-9]+)", - "name": "finished copying resp into pkts"}, -] - -def print_stats(patterns, rpcs): - """ - Print out a time line of when the events in patterns occur, using - data collected in rpcs - """ - for i in range(1, len(patterns)): - pattern = patterns[i] - elapsed = [] - deltas = [] - for id in rpcs: - rpc = rpcs[id] - if (0 not in rpc) or ((len(patterns)-1) not in rpc): - continue - if i not in rpc: - continue - elapsed.append(rpc[i] - rpc[0]) - prev = i - 1 - while not prev in rpc: - prev -= 1 - deltas.append(rpc[i] - rpc[prev]) - if len(elapsed) == 0: - print("%-32s (no events)" % (pattern["name"])) - continue - elapsed = sorted(elapsed) - deltas = sorted(deltas) - print("%-32s Avg %7.1f us (+%7.1f us) P90 %7.1f us (+%7.1f us)" % ( - pattern["name"], sum(elapsed)/len(elapsed), sum(deltas)/len(deltas), - elapsed[9*len(elapsed)//10], deltas[9*len(deltas)//10])) - -patterns = client_patterns -aux_patterns = aux_client_patterns -if (len(sys.argv) >= 2) and (sys.argv[1] == "--server"): - patterns = server_patterns - aux_patterns = aux_server_patterns - sys.argv.pop(1) -if len(sys.argv) == 2: - f = open(sys.argv[1]) -elif len(sys.argv) == 1: - f = sys.stdin -else: - print("Usage: %s [--server] [tt_file]" % (sys.argv[0])) - sys.exit(1) - -# Keys are RPC ids. Each value is a dictionary whose keys are indexes -# within patterns and whose values are the times when that event occurred. -rpcs = {} - -# Similar to rpcs, except records info about aux_patterns. -aux_rpcs = {} - -# Keys are RPC ids. Value is the last starting offset seen in a packet -# transmitted or received for that RPC (used to calculate throughputs). -last_out_offset = {} -last_in_offset = {} - -# Keys are RPC ids. Value represents time first or last data packet was -# sent or received -first_in_time = {} -first_out_time = {} -last_in_time = {} -last_out_time = {} - -# Keys are core ids. Value is the most recent time when a copy to user -# space was initiated on that core. -last_copy_out_start = {} - -# Keys are core ids. Value is the most recent time when a copy from user -# space was initiated on that core. -last_copy_in_start = {} - -# These variables track data copies into and out of the kernel -copy_in_data = 0 -copy_in_time = 0 -copy_out_data = 0 -copy_out_time = 0 - -# Copy out time that also includes time to delete skbuffs. -copy_out_time_with_del = 0 - -# A list containing the elapsed time for each invocation of ip_queue_xmit -# or ip6_xmit for a data packet -xmit_times = [] - -# For each core, time of most recent call to either ip_queue_xmit or ip6_xmit. 
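# All of these scripts key off the same timetrace record layout:
#   "<time> us (+<delta> us) [C<core>] <message>"
# A standalone example of pulling the fields out of one such record
# (the record itself is made up):
import re
sample = '  1234.567 us (+   2.345 us) [C07] homa_gro_receive got packet'
m = re.match(r' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] (.*)',
             sample)
assert m is not None
time = float(m.group(1))      # cumulative time, microseconds
delta = float(m.group(2))     # gap since the previous record
core = int(m.group(3))        # CPU core that logged the record
msg = m.group(4)              # free-form event text
assert core == 7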
-start_xmit = {} - -for line in f: - for i in range(len(patterns)): - pattern = patterns[i] - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - + pattern["pattern"], line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - if not id in rpcs: - rpcs[id] = {} - if (i in rpcs[id]) and (not "record_last" in pattern): - continue - rpcs[id][i] = time - # print("%8.3f: %s for id %d" % (time, names[i], id)) - if "first_in" in pattern: - first_in_time[id] = time - if "first_out" in pattern: - first_out_time[id] = time - if "in_packet" in pattern: - last_in_time[id] = time - last_in_offset[id] = int(match.group(5)) - if "out_packet" in pattern: - last_out_time[id] = time - last_out_offset[id] = int(match.group(5)) - for i in range(len(aux_patterns)): - pattern = aux_patterns[i] - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - + pattern["pattern"], line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - if not id in aux_rpcs: - aux_rpcs[id] = {} - if (i in aux_rpcs[id]) and (not "record_last" in pattern): - continue - aux_rpcs[id][i] = time - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'starting copy to user space', line) - if match: - time = float(match.group(1)) - core = int(match.group(3)) - last_copy_out_start[core] = time - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'starting copy from user space', line) - if match: - time = float(match.group(1)) - core = int(match.group(3)) - last_copy_in_start[core] = time - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'finished copy from user space for id ([-0-9.]+), length ([-0-9.]+)', line) - if match: - time = float(match.group(1)) - core = int(match.group(3)) - id = match.group(4) - length = int(match.group(5)) - if core in last_copy_in_start: - copy_in_time += time - last_copy_in_start[core] - copy_in_data += length - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'finished copy from user space', line) - if match: - time = float(match.group(1)) - core = int(match.group(3)) - last_copy_in_start[core] = time - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'finished copying ([-0-9.]+) bytes for id ([-0-9.]+)', line) - if match: - time = float(match.group(1)) - count = int(match.group(4)) - core = int(match.group(3)) - if core in last_copy_out_start: - elapsed = time - last_copy_out_start[core] - copy_out_time += elapsed - copy_out_data += count - # print("%8.3f: %d bytes copied in %.1f usec: %.1f GB/sec" % ( - # qtime, count, elapsed, (count/1000)/elapsed)) - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'finished freeing [0-9]+ skbs', line) - if match: - time = float(match.group(1)) - core = int(match.group(3)) - if core in last_copy_out_start: - copy_out_time_with_del += time - last_copy_out_start[core] - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'calling .*_xmit: wire_bytes', line) - if match: - time = float(match.group(1)) - core = int(match.group(3)) - start_xmit[core] = time - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Finished queueing packet:', line) - if match: - time = float(match.group(1)) - core = int(match.group(3)) - if core in start_xmit: - xmit_times.append(time - start_xmit[core]) - -# Make sure aux_rpcs doesn't contain RPCs not in rpcs. 
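# Unit check for the throughput figures printed below: data counts are kept
# in bytes and times in microseconds, so bytes/us * 8 gives Mbit/s and the
# 8e-03 factor converts directly to Gbit/s. E.g. (made-up numbers):
out_data = 125000                  # bytes transmitted
out_time = 100.0                   # usecs spent transmitting
gbps = 8e-03 * out_data / out_time
assert abs(gbps - 10.0) < 1e-9     # 125000 B in 100 us is 10 Gbit/s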
-bad_ids = [] -for id in aux_rpcs: - if id in rpcs: - rpc = rpcs[id] - if (0 in rpc) and ((len(patterns)-1) in rpc): - continue - bad_ids.append(id) -for id in bad_ids: - del aux_rpcs[id] - -print_stats(patterns, rpcs) -print("") -print_stats(aux_patterns, aux_rpcs) - -print("\nTotal RPCs: %d" % (len(rpcs))) -avg_rpcs = 0 -last = len(patterns)-1 -for id in rpcs: - rpc = rpcs[id] - if 0 in rpc: - start = rpc[0] - else: - start = 0 - if last in rpc: - end = rpc[last] - else: - end = time - avg_rpcs += (end - start) / time - # print("RPC id %d: %.1f (%8.3f -> %8.3f)" % (id, end - start, start, end)) -print("Average active RPCS: %.1f" % (avg_rpcs)) - -out_data = 0 -out_time = 0 -in_data = 0 -in_time = 0 -for id in first_in_time: - if (id in last_in_time) and (id in last_in_offset): - in_data += last_in_offset[id] - in_time += last_in_time[id] - first_in_time[id] -for id in first_out_time: - if (id in last_out_time) and (id in last_out_offset): - out_data += last_out_offset[id] - out_time += last_out_time[id] - first_out_time[id] -print("Throughput:") -if out_time != 0: - print(" Transmit goodput (per RPC): %5.1f Gbps (%4.1f%% of total time)" % ( - 8e-03*out_data/out_time, - 100.0*out_time/end)) -print(" Transmit goodput (aggregate): %5.1f Gbps" % ( - 8e-03*out_data/end)) -if in_time != 0: - print(" Receive goodput (per RPC): %5.1f Gbps (%4.1f%% of total time)" % ( - 8e-03*in_data/in_time, - 100.0*in_time/end)) -print(" Receive goodput (aggregate): %5.1f Gbps" % ( - 8e-03*in_data/end)) -if copy_in_time != 0: - print(" Copy from user space (per thread): %5.1f Gbps (%4.1f%% of total time)" % ( - 8e-03*copy_in_data/copy_in_time, - 100.0*copy_in_time/end)) -if copy_out_time != 0: - print(" Copy to user space (per thread): %5.1f Gbps (%4.1f%% of total time)" % ( - 8e-03*copy_out_data/copy_out_time, - 100.0*copy_out_time/end)) -if copy_out_time_with_del != 0: - print(" Copy to user space (inc. skb gc): %5.1f Gbps (%4.1f%% of total time)" % ( - 8e-03*copy_out_data/copy_out_time_with_del, - 100.0*copy_out_time_with_del/end)) - -if len(xmit_times): - xmit_times = sorted(xmit_times) - print("\nAverage time to xmit packet: %.1f us (P0: %.1f, P50: %.1f, " - "P90: %.1f, P100: %.1f)" % (sum(xmit_times)/len(xmit_times), - xmit_times[0], xmit_times[len(xmit_times)//2], - xmit_times[9*len(xmit_times)//10], xmit_times[-1])) \ No newline at end of file diff --git a/util/ttskbs.py b/util/ttskbs.py deleted file mode 100755 index 547c24b1..00000000 --- a/util/ttskbs.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/python3 - -# Copyright (c) 2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -""" -Scans a time trace file to analyze the lifetimes of receive buffers -(e.g. how many are active at a time, how long they live, etc.) 
-Usage: ttskbs.py [--threshold t] [--verbose] [file] -The --threshold option specifies a time in usecs: info will be printed -for every buffer whose lifetime is at least that long. If --verbose is -specified then start and end times are printed for each buffer. -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys - -threshold = 0.0 -verbose = False -f = sys.stdin - -while (len(sys.argv) > 1) and sys.argv[1].startswith("--"): - if sys.argv[1] == "--help": - print("Usage: %s [--threshold usecs] [file]" % (sys.argv[0])) - sys.exit(0) - if sys.argv[1] == "--verbose": - verbose = True - sys.argv.pop(1) - continue - if len(sys.argv) < 3: - print("Missing value for %s option" % (sys.argv[1])) - sys.exit(1) - if sys.argv[1] == "--threshold": - threshold = float(sys.argv[2]) - sys.argv.pop(1) - sys.argv.pop(1) -if len(sys.argv) >= 2: - f = open(sys.argv[1]) - -# Dictionary where each entry corresponds to a packet buffer -# currently in use; the key has the form "id:offset", and the -# value is the time when the packet was passed to homa_gro_receive -active_skbs = {} - -# Dictionary with one entry for each market buffer, indexed by -# buffer id of the form rpc_id:offset. Each value is a dictionary -# containing name-time entries for that buffer: -# gro: time when the buffer was seen by gro_receive -# softirq_start: homa_softirq woke up (eventually processed buffer) -# softirq: homa_softirq processed this buffer -# copy_start: homa_copy_out started processing a batch of -# buffers containing this one -# free: homa_copy_out freed this buffer -rpcs = {} - -num_active = 0 -max_active = 0 - -# List whose entries are the lifetimes of individual data packets. -lifetimes = [] - -# Dictionary where keys are core ids and values are the most recent time -# homa_softirq started executing on that core. -softirq_start = {} - -# Dictionary where keys are core ids and values are the most recent time -# homa_copy_out started executing on that core. 
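# The per-core dictionaries above implement an idiom used throughout these
# scripts: a "start" record is remembered per core, and a later "finish"
# record on the same core is charged against that start time. Minimal shape
# of the idiom (the event stream here is illustrative only):
copy_out_start = {}
elapsed = None
for time, core, event in [(10.0, 2, 'start'), (13.5, 2, 'finish')]:
    if event == 'start':
        copy_out_start[core] = time
    elif core in copy_out_start:
        elapsed = time - copy_out_start[core]
assert elapsed == 3.5              # finish at 13.5 minus start at 10.0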
-copy_out_start = {} - -for line in f: - match = re.match(' *([0-9.]+) us .* \[(C[0-9]+)\] (.*)', line) - if not match: - continue - time = float(match.group(1)) - core = match.group(2) - msg = match.group(3) - - match = re.match('.* id ([0-9.]+).*offset ([0-9.]+)', msg) - if match: - id = match.group(1) + ':' + match.group(2) - if not id in rpcs: - rpcs[id] = {} - rpc = rpcs[id] - - if "_gro_receive got packet" in msg: - rpc["gro"] = time - num_active += 1 - if num_active > max_active: - max_active = num_active - if verbose: - print("%9.3f: allocate %s (%d now active)" % ( - time, id, len(active_skbs))) - - if "gro" not in rpc: - continue - - if "homa_copy_out freeing skb" in msg: - rpc["copy_start"] = copy_out_start[core] - rpc["free"] = time - lifetime = time - rpc["gro"] - lifetimes.append(lifetime) - if (threshold > 0) and (lifetime >= threshold): - print("%9.3f: packet %s freed after %5.1f usec" % (time, - id, lifetime)) - num_active -= 1 - if verbose: - print("%9.3f: free %s (%d now active)" % (time, id, - num_active)) - - if "incoming data packet" in msg: - rpc["softirq_start"] = softirq_start[core] - rpc["softirq"] = time - - if "homa_softirq: first packet" in msg: - softirq_start[core] = time - - if "starting copy to user space" in msg: - copy_out_start[core] = time - -if len(lifetimes) == 0: - print("No packets found with complete life cycle") - exit(1) - -print("Maximum number of active skbs: %d" % (max_active)) - -# Lists of elapsed times from one event to another: -gro_to_softirq_start = [] -softirq_start_to_softirq = [] -softirq_to_copy_start = [] -copy_start_to_free = [] - -for key in rpcs.keys(): - rpc = rpcs[key] - if not "free" in rpc: - continue - gro_to_softirq_start.append(rpc["softirq_start"] - rpc["gro"]) - softirq_start_to_softirq.append(rpc["softirq"] - rpc["softirq_start"]) - softirq_to_copy_start.append(rpc["copy_start"] - rpc["softirq"]) - copy_start_to_free.append(rpc["free"] - rpc["copy_start"]) -gro_to_softirq_start = sorted(gro_to_softirq_start) -softirq_start_to_softirq = sorted(softirq_start_to_softirq) -softirq_to_copy_start = sorted(softirq_to_copy_start) -copy_start_to_free = sorted(copy_start_to_free) -lifetimes = sorted(lifetimes) - -print(" Duration (usecs)") -print("Phase of packet lifetime P10 P50 P90 Max") -print("----------------------------------------------------------------------") -l = len(gro_to_softirq_start) -print("GRO -> homa_softirq invocation: %5.1f %5.1f %5.1f %5.1f" % ( - gro_to_softirq_start[10*l//100], - gro_to_softirq_start[50*l//100], - gro_to_softirq_start[90*l//100], - gro_to_softirq_start[l-1])) -l = len(softirq_start_to_softirq) -print("homa_softirq_invocation -> SoftIRQ for packet %5.1f %5.1f %5.1f %5.1f" % ( - softirq_start_to_softirq[10*l//100], - softirq_start_to_softirq[50*l//100], - softirq_start_to_softirq[90*l//100], - softirq_start_to_softirq[l-1])) -l = len(softirq_to_copy_start) -print("SoftIRQ for packet -> copy_out invocation %5.1f %5.1f %5.1f %5.1f" % ( - softirq_to_copy_start[10*l//100], - softirq_to_copy_start[50*l//100], - softirq_to_copy_start[90*l//100], - softirq_to_copy_start[l-1])) -l = len(copy_start_to_free) -print("copy_out invocation -> packet free %5.1f %5.1f %5.1f %5.1f" % ( - copy_start_to_free[10*l//100], - copy_start_to_free[50*l//100], - copy_start_to_free[90*l//100], - copy_start_to_free[l-1])) -l = len(lifetimes) -print("End to end lifetime (GRO -> free) %5.1f %5.1f %5.1f %5.1f" % ( - lifetimes[10*l//100], - lifetimes[50*l//100], - lifetimes[90*l//100], - lifetimes[l-1])) diff --git 
a/util/ttsoftirq.py b/util/ttsoftirq.py deleted file mode 100755 index 77802e1f..00000000 --- a/util/ttsoftirq.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/python3 - -""" -Analyzes softirq wakeup times in a timetrace. -Usage: softirq.py [tt_file] - -The existing timetrace is in tt_file (or stdin if tt_file is omitted). -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys - -if len(sys.argv) == 2: - f = open(sys.argv[1]) -elif len(sys.argv) == 1: - f = sys.stdin -else: - print("Usage: %s [tt_file]" % (sys.argv[0])) - sys.exit(1) - -queued = {} -delays = [] - -for line in f: - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\]', line) - if not match: - continue - time = float(match.group(1)) - core = int(match.group(2)) - - match = re.match('.*enqueue_to_backlog.* cpu ([0-9]+)', line) - if match: - dest = int(match.group(1)) - queued[dest] = time - - match = re.match('.*homa_softirq: first packet', line) - if match: - if core in queued: - delay = time - queued[core] - delays.append(delay) - if delay > 10.0: - print("%9.3f Long SoftIRQ delay: %.1f usec (C%02d)" % - (time, delay, core)) - -delays.sort() -print("Minimum delay: %4.1f usec" % (delays[0])) -print("Median delay: %4.1f usec" % (delays[len(delays)//2])) -print("P90 delay: %4.1f usec" % (delays[len(delays)*9//10])) -print("P99 delay: %4.1f usec" % (delays[len(delays)*99//100])) -print("Maximum delay: %4.1f usec" % (delays[-1])) \ No newline at end of file diff --git a/util/ttsum.py b/util/ttsum.py index 49e800f0..a6822f75 100755 --- a/util/ttsum.py +++ b/util/ttsum.py @@ -1,25 +1,14 @@ #!/usr/bin/python3 -# Copyright (c) 2019-2022 Stanford University -# -# Permission to use, copy, modify, and distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Copyright (c) 2019-2022 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ This program reads one or more timetrace logs and generates summary information. Use the --help option to print usage information. """ -from __future__ import division, print_function +from collections import defaultdict from glob import glob from optparse import OptionParser import math @@ -31,7 +20,7 @@ # This variable collects all the times for all events, individually. It is # a dictionary that maps from key names to a list containing all of the # intervals for that event name (each interval is the elapsed time between -# the most recent previous event and this event). +# the previous event and this event). eventIntervals = {} # This variable collects information for all events relative to a given @@ -62,6 +51,19 @@ eventCount = {} +# Core number -> time of most recent event on that core. None means no +# events seen for that core yet.
+corePrev = defaultdict(lambda : None) + +# Core number -> most recent time the "starting event" occurred on +# that core. +startTimes = defaultdict(lambda : None) + +# Core number -> dictionary mapping from event string to the number +# of times that event has occurred on the given core since the starting +# event. +eventCounts = defaultdict(lambda: defaultdict(lambda: 0)) + def scan(f, startingEvent): """ Scan the log file given by 'f' (handle for an open file) and collect @@ -71,23 +73,30 @@ def scan(f, startingEvent): other events, relative to the most recent occurrence of the starting event. """ - foundStart = False - startTime = 0.0 - lastTime = -1.0 + lastTime = None for line in f: - match = re.match('(^|.* )([0-9.]+) us \(\+ *([0-9.]+) us\) (.+)', line) + match = re.match(r'(^|.* )([0-9.]+) us \(\+ *([0-9.]+) us\) ' + r'\[C([0-9]+)\] (.+)', line) if not match: continue - thisEventTime = float(match.group(2))*1000.0 - thisEventInterval = float(match.group(3))*1000.0 - thisEvent = match.group(4) + thisEventTime = float(match.group(2)) + core = int(match.group(4)) + thisEvent = match.group(5) + if not options.useCores: + core = 0 + prevTime = corePrev[core] + if prevTime == None: + thisEventInterval = 0 + else: + thisEventInterval = thisEventTime - prevTime rawEvent = thisEvent if options.noNumbers: - thisEvent = re.sub('0x[0-9a-f]+', '?', thisEvent) - thisEvent = re.sub('[0-9]+', '?', thisEvent) - if (thisEventTime < lastTime): + thisEvent = re.sub(r'\b0x[0-9a-f]+\b', '?', thisEvent) + thisEvent = re.sub(r'\b[0-9.]+\b', '?', thisEvent) + if (lastTime != None) and (thisEventTime < lastTime): print('Time went backwards at the following line:\n%s' % (line)) lastTime = thisEventTime + corePrev[core] = thisEventTime if thisEventInterval != 0.0: if not thisEvent in eventIntervals: eventIntervals[thisEvent] = [] @@ -97,11 +106,11 @@ def scan(f, startingEvent): if startingEvent in rawEvent: # Reset variables to indicate that we are starting a new # sequence of events from the starting event. - startTime = thisEventTime - foundStart = True - eventCount = {} + startTimes[core] = thisEventTime + eventCounts[core] = defaultdict(lambda: 0) - if not foundStart: + startTime = startTimes[core] + if startTime == None: continue # If we get here, it means that we have found an event that @@ -109,13 +118,13 @@ def scan(f, startingEvent): # the starting event. First, see how many times this event has # occurred since the last occurrence of the starting event. relativeTime = thisEventTime - startTime - # print('%.1f %.1f %s' % (relativeTime, thisEventInterval, thisEvent)) - if thisEvent in eventCount: - count = eventCount[thisEvent] + 1 - else: - count = 1 - eventCount[thisEvent] = count - # print("Count for '%s': %d" % (thisEvent, count)) + # print('%9.3f: %.1f %.1f %s' % (thisEventTime, relativeTime, + # thisEventInterval, thisEvent)) + count = eventCounts[core][thisEvent] + 1 + eventCounts[core][thisEvent] = count + + # print("%9.3f: count for '%s': %d" % (thisEventTime, thisEvent, + # count)) if not thisEvent in relativeEvents: relativeEvents[thisEvent] = [] occurrences = relativeEvents[thisEvent] @@ -135,6 +144,12 @@ def scan(f, startingEvent): dest='altFormat', help='use alternate output format if -f is specified (print min, ' 'max, etc. 
for cumulative time, not delta)') +parser.add_option('-c', '--cores', action='store_true', default=False, + dest='useCores', + help='treat events on each core independently: compute elapsed time ' + 'for each event relative to the previous event on the same core, and ' + 'if -f is specified, compute relative times separately on each core ' + '(default: consider all events on all cores as a single stream)') parser.add_option('-f', '--from', type='string', dest='startEvent', help='measure times for other events relative to FROM; FROM contains a ' 'substring of an event') @@ -171,8 +186,8 @@ def scan(f, startingEvent): intervals.sort() medianTime = intervals[len(intervals)//2] message = '%-*s %6.0f %6.0f %6.0f %6.0f %7d' % (nameLength, - event, medianTime, intervals[0], intervals[-1], - sum(intervals)/len(intervals), len(intervals)) + event, medianTime * 1e03, intervals[0] * 1e03, intervals[-1] * 1e03, + 1e03 * sum(intervals)/len(intervals), len(intervals)) outputInfo.append([medianTime, message]) # Pass 2: sort in order of median interval length, then print. @@ -218,14 +233,15 @@ def scan(f, startingEvent): medianInterval = intervals[len(intervals)//2] if options.altFormat: message = '%-*s %6.0f %6.0f %6.0f %6.0f %6.0f %7d' % ( - nameLength, eventName, medianTime, times[0], times[-1], - sum(times)/len(times), intervals[len(intervals)//2], - len(times)) + nameLength, eventName, medianTime*1e03, times[0]*1e03, + times[-1] * 1e03, sum(times) * 1e03/len(times), + intervals[len(intervals)//2] * 1e03, len(times)) else: message = '%-*s %6.0f %6.0f %6.0f %6.0f %6.0f %7d' % ( - nameLength, eventName, medianTime, medianInterval, - intervals[0], intervals[-1], sum(intervals)/len(intervals), - len(intervals)) + nameLength, eventName, medianTime * 1e03, + medianInterval * 1e03, + intervals[0] * 1e03, intervals[-1] * 1e03, + 1e03 * sum(intervals)/len(intervals), len(intervals)) outputInfo.append([medianTime, message]) outputInfo.sort(key=lambda item: item[0]) diff --git a/util/ttsync.py b/util/ttsync.py index 27d76d88..8c651b13 100755 --- a/util/ttsync.py +++ b/util/ttsync.py @@ -1,154 +1,550 @@ #!/usr/bin/python3 -""" -Scans two timetraces covering the same time interval, one from a client and one -from a server, determines the clock offset between the two machines, and -outputs the server timetrace with its times adjusted so that they are -synchronized with the client timetrace. - -Usage: ttsync.py [--verbose] [client [server]] +# Copyright (c)2023 Homa Developers +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ -The "client" and "server" arguments give the names of the two timetrace -files; they default to client.tt and server.tt. 
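# In the updated ttsum.py above, event times are apparently now kept in
# microseconds (the trace's native unit) instead of being converted on
# input; the 1e03 factors at print time preserve the previous scale of
# the output columns. Sanity check of that conversion:
interval_us = 0.325                  # interval as parsed from the trace
printed = interval_us * 1e03         # value shown in the output columns
assert abs(printed - 325.0) < 1e-9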
+""" +Scans two or more timetraces covering the same time interval, determines the +clock offsets between each machine and the first, and adjusts the times in +all of the traces except the first so that the clocks are synchronized +across the traces """ -from __future__ import division, print_function +from collections import defaultdict from glob import glob from optparse import OptionParser import math import os +from pathlib import Path import re import string import sys -from statistics import median - -client_trace = "client.tt" -server_trace = "server.tt" -verbose = False -if (len(sys.argv) >= 2) and (sys.argv[1] == "--help"): - print("Usage: %s [--verbose] [client_trace [server_trace]]" % (sys.argv[0])) - sys.exit(0) -if (len(sys.argv) >= 2) and (sys.argv[1] == "--verbose"): - verbose = True - sys.argv.pop(1) -if len(sys.argv) >= 2: - client_trace = sys.argv[1] - sys.argv.pop(1) -if len(sys.argv) >= 2: - server_trace = sys.argv[1] - -def parse_tt(tt, server): - """ - Reads the timetrace file given by tt and returns a dictionary containing - extracted statistics (see below). The server argument indicates whether - this is a server trace; if so, 1 gets subtracted from all RPC ids to - produce client ids. +import tempfile + +# Parse command line options +parser = OptionParser(description= + 'Read two or more timetraces, compute the clock offsets between them, ' + 'and rewrite all of the traces except the first to synchronize ' + 'their clocks. Also prints statistics about one-way packet delays.', + usage='%prog [options] t1 t2 ...', + conflict_handler='resolve') +parser.add_option('--no-rewrite', action='store_true', dest='no_rewrite', + default=False, metavar='T/F', help='read-only: compute offsets but ' + "don't rewrite trace files") +parser.add_option('--verbose', '-v', action='store_true', default=False, + dest='verbose', + help='print lots of output') + +(options, tt_files) = parser.parse_args() +if len(tt_files) < 2: + print('Need at least 2 trace files; run "ttsync.py --help" for help') + exit(1) + +# (rpc_id:offset) -> for each packet sent. rpc_id is the +# id on the sender, and node is the integer node identifier of the sender, +# as passed to parse_tt. +send_pkts = {} + +# (rpc_id:offset) -> for each packet received. rpc_id is the +# id on the sender, and node is the integer node identifier of the +# receiver, as passed to parse_tt. +recv_pkts = {} + +# (rpc_id:offset) -> 1 for each retransmitted packet. rpc_id is the +# id on the sender; used to ignore retransmits when syncing clocks +# (the retransmit can accidentally be paired with the receipt of the +# original packet). +retransmits = {} + +# node_num -> rpc_id -> . For each node number, contains +# a dictionary mapping from RPC identifiers to a list of unadjusted times +# when busy or resend packets were transmitted for rpc_id. Rpc_id the id on +# the sender. +send_ctl = defaultdict(lambda: defaultdict(list)) + +# rpc_id -> times. Times is a list of unadjusted times when resend or +# busy packets were received for rpc_id (rpc_id is the id on the receiver). +recv_ctl = defaultdict(list) + +# pkt_id -> for each TCP packet sent. pkt_id is a unique +# identifier for the packet: saddr:sport:daddr:dport:sequence:data_bytes:ack. +send_tcp = {} + +# pkt_id -> for each TCP packet received. pkt_id has the same +# structure as for send_tcp. +recv_tcp = {} + +# List of with one entry for each FREEZE packet +# sent. Time is the unadjusted time on the sender when the packet was sent. 
+# sender is the sender node index, and receiver is the receiver *address*. +send_freeze = [] + +# node_num -> <time, sender> for each FREEZE packet received. Time +# is the unadjusted time on the receiver when the last freeze packet +# was received by node_num. Sender is the sender *address*. +recv_freeze = {} + +# This is an NxN array, where N is the number of nodes. min_delays[A][B] +# gives the smallest delay seen from node A to node B, as measured with +# their unadjusted clocks (one of these delays could be negative). +min_delays = [] + +# This is an NxN array, where N is the number of nodes. Each entry corresponds +# to an entry in min_delays, and gives the time when the message producing +# the minimum delay was received. +min_recv_times = [] + +# For each node, the offset to add to its clock value in order to synchronize +# its clock with node 0. +offsets = [] + +# rpc_id -> maximum offset that has been sent so far for that RPC; used to +# skip retransmitted packets, which can mess up delay calculations. +max_send_offsets = {} + +# rpc_id -> maximum offset that has been received so far for that RPC; used to +# skip receipts of retransmissions, which can mess up delay calculations. - The return value from parse_tt is a dictionary whose keys are packet - ids (rpc_id:offset). Each value is a dictionary containing some or all - of the following elements: +max_recv_offsets = {} - send: time when ip_queue_xmit was called for that data packet - gro_recv: time when homa_gro_receive saw the packet - """ +# rpc_id -> IP address of the node that sent or received packets with that ID. +id_addr = {} + +# rpc_id -> node index for the node that sent or received packets with that ID. +id_node_num = {} + +# IP address of node -> node's index in sync tables. +addr_node_num = {} + +def parse_tt(tt, node_num): + """ + Reads a timetrace file and adds entries to send_pkts and recv_pkts.
+ Also updates num_records and last_time - global verbose - packets = {} + tt: Name of the timetrace file + node_num: Integer identifier for this file/node (should reflect the + order of the timetrace file in the arguments + """ + + global options, send_pkts, recv_pkts, max_send_offsets, max_recv_offsets + global retransmits sent = 0 recvd = 0 + num_records = 0 + first_time = None + last_time = None for line in open(tt): - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\]' - '.* id ([-0-9.]+),.* offset ([-0-9.]+)', line) + num_records += 1 + match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] (.*)', line) if not match: continue - time = float(match.group(1)) - core = int(match.group(3)) - id = match.group(4) - if (server): - id = str(int(id) - 1) - offset = match.group(5) - pktid = id + ":" + offset - - if not pktid in packets: - packets[pktid] = {} - - if re.match('.*calling .*_xmit: wire_bytes', line): - packets[pktid]["send"] = time + core = int(match.group(2)) + msg = match.group(3) + if first_time == None: + first_time = time + last_time = time + + match = re.match('.* id ([-0-9.]+),.* offset ([-0-9.]+)', msg) + if not match: + match = re.match('Transmitting TCP packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)', msg) + if match: + id = '%s:%s:%s:%s' % (match.group(1), match.group(2), + match.group(3), match.group(4)) + send_tcp[id] = [time, node_num] + sent += 1 + continue + + match = re.match('tcp_gro_receive got packet from ' + '(0x[a-f0-9]+) to (0x[a-f0-9]+), data bytes ([0-9]+), ' + 'seq/ack ([0-9]+)', msg) + if match: + id = '%s:%s:%s:%s' % (match.group(1), match.group(2), + match.group(3), match.group(4)) + recv_tcp[id] = [time, node_num] + recvd += 1 + continue + + match = re.match('retransmitting offset ([0-9.]+), .*id ([0-9.]+)', + msg) + if match: + offset = int(match.group(1)) + id = int(match.group(2)) + pktid = '%d:%d' % (id, offset) + retransmits[pktid] = 1 + continue + + match = re.match('sending BUSY from resend, id ([0-9]+),', msg) + if match: + id = match.group(1) + send_ctl[node_num][id].append(time) + continue + + match = re.match('[^ ]+ sent homa packet to ([^ ]+) id ([0-9]+), ' + 'type (0x[0-9a-f]+)', msg) + if match: + addr = match.group(1) + id = match.group(2) + type = match.group(3) + id_addr[peer_id(id)] = addr + id_node_num[id] = node_num + if type != '0x12' and type != '0x14': + continue + send_ctl[node_num][id].append(time) + continue + + match = re.match('homa_gro_receive got packet from ([^ ]+) id ' + '([0-9]+), type (0x[0-9a-f]+)', msg) + if match: + addr = match.group(1) + id = match.group(2) + type = match.group(3) + id_addr[peer_id(id)] = addr + id_node_num[id] = node_num + if type == '0x16': + recv_freeze[node_num] = [time, addr] + continue + if type != '0x12' and type != '0x14': + continue + recv_ctl[id].append(time) + continue + + match = re.match('Sending freeze to (0x[0-9a-f]+)', msg) + if match: + addr = match.group(1) + send_freeze.append([time, node_num, addr]) + continue + continue + + id = int(match.group(1)) + offset = int(match.group(2)) + + if re.match('.*calling .*_xmit: wire_bytes', msg): + if (id in max_send_offsets) and (max_send_offsets[id] >= offset): + continue + pktid = '%d:%d' % (id, offset) + if pktid in retransmits: + continue + send_pkts[pktid] = [time, node_num] + max_send_offsets[id] = offset sent += 1 - if "homa_gro_receive got packet" in line: - packets[pktid]["gro_recv"] = time + match2 = re.match('.*Finished queueing packet: rpc id .*, offset .*, ' + 'len 
([0-9.]+)', msg) + if match2: + pktid = '%d:%d' % (id, offset) + if pktid in retransmits: + continue + last_offset = offset + int(match2.group(1)) - 1 + if (id in max_send_offsets) and (max_send_offsets[id] < last_offset): + max_send_offsets[id] = last_offset + continue + + if "homa_gro_receive got packet" in msg: + if (id in max_recv_offsets) and (max_recv_offsets[id] >= offset): + continue + pktid = '%d:%d' % (id^1, offset) + recv_pkts[pktid] = [time, node_num] + max_recv_offsets[id] = offset recvd += 1 + continue - if verbose: - print("%s trace has %d packet sends, %d receives" % ( - ("Server" if server else "Client"), sent, recvd), - file=sys.stderr) - return packets + if "sending grant for" in msg: + pktid = '%d:%dg' % (id, offset) + if not pktid in send_pkts: + send_pkts[pktid] = [time, node_num] + sent += 1 + continue + + if "homa_gro_receive got grant from" in msg: + pktid = '%d:%dg' % (id^1, offset) + recv_pkts[pktid] = [time, node_num] + recvd += 1 + continue + + match = re.match(r'Sent RESEND for client RPC id ([0-9]+), ' + r'server ([^:]+):', msg) + if False and match: + id = match.group(1) + addr = match.group(2) + id_addr[peer_id(id)] = addr + id_node_num[id] = node_num + send_ctl[node_num][id].append(time) + continue + + print('%-12s %8d %8d %8d %8.1f' % (tt, num_records, sent, recvd, + (last_time - first_time)/1000)) + +def find_min_delays(num_nodes): + """ + Combines the information in send_pkts, recv_pkts, send_tcp, and + recv_tcp to fill in min_delays + + num_nodes: Total number of distinct nodes; node numbers in + send_pkts and recv_pkts must be < num_nodes. + """ + + global min_delays, min_recv_times, send_pkts, recv_pkts + + min_delays = [[1e20 for i in range(num_nodes)] for j in range(num_nodes)] + min_recv_times = [[0 for i in range(num_nodes)] for j in range(num_nodes)] + + # Iterate over all the client-side events and match them to server-side + # events if possible. + for id, send_info in send_pkts.items(): + if not id in recv_pkts: + continue + send_time, send_node = send_info + recv_time, recv_node = recv_pkts[id] + delay = recv_time - send_time + if delay < min_delays[send_node][recv_node]: + min_delays[send_node][recv_node] = delay + min_recv_times[send_node][recv_node] = recv_time + + for id, send_info in send_tcp.items(): + if not id in recv_tcp: + continue + send_time, send_node = send_info + recv_time, recv_node = recv_tcp[id] + delay = recv_time - send_time + if delay < min_delays[send_node][recv_node]: + min_delays[send_node][recv_node] = delay + min_recv_times[send_node][recv_node] = recv_time + +def find_min_delays_alt(num_nodes): + """ + This function provides an alternate way to compute minimum delays, + using resend and busy packets instead of data and grant packets. It's + useful in situations where the cluster has stalled so there aren't any + data/grant packets. + """ + global send_ctl, recv_ctl, send_freeze, recv_freeze + global min_delays, min_recv_times, addr_node_num + + # Resend and busy packets are problematic because they are not unique: + # there can be several identical packets between the same pair of nodes. + # Here's how this function matches up sends and receives: + # * Start from freeze packets, which are unique; use them to compute + # an upper bound on delays in one direction. + # * Then scan packets flowing in the other direction: match sends and + # receives to pick the pair that produces the smallest positive RTT + # (when combined with freeze info in the other direction). 
+ # * Then use this minimum in the other direction to match sends and + # receives in the same direction as the freeze, to get a tighter bound + # than the freeze could produce by itself. + + for send_time, fsend_node, recv_addr in send_freeze: + # Compute freeze delay. + if not recv_addr in addr_node_num: + continue + frecv_node = addr_node_num[recv_addr] + if not frecv_node in recv_freeze: + continue + recv_time = recv_freeze[frecv_node][0] + freeze_delay = recv_time - send_time + if freeze_delay < min_delays[fsend_node][frecv_node]: + # print("New min delay %.1f us from %d to %d (freeze) send %.1f recv %.1f" % + # (freeze_delay, fsend_node, frecv_node, send_time, recv_time)) + min_delays[fsend_node][frecv_node] = freeze_delay + min_recv_times[fsend_node][frecv_node] = recv_time + + # Scan control packets in reverse direction from freeze. + min_delay = min_delays[frecv_node][fsend_node] + for id, send_times in send_ctl[frecv_node].items(): + id2 = peer_id(id) + if not id2 in id_node_num or id_node_num[id2] != fsend_node: + continue + for send in send_times: + for recv in recv_ctl[id2]: + delay = recv - send + if freeze_delay + delay > 0 and delay < min_delay: + # print("New min delay %.1f us (rtt %.1f) from %d " + # "to %d (reverse ctl) id %s send %.1f recv %.1f" % (delay, + # delay + freeze_delay, frecv_node, fsend_node, + # id, send, recv)) + min_delay = delay + min_delays[frecv_node][fsend_node] = delay + min_recv_times[frecv_node][fsend_node] = recv + + # Scan control packets in same direction as freeze. + reverse_delay = min_delay + if reverse_delay == 1e20: + continue + min_delay = min_delays[fsend_node][frecv_node] + for id, send_times in send_ctl[fsend_node].items(): + id2 = peer_id(id) + if not id2 in id_node_num or id_node_num[id2] != frecv_node: + continue + for send in send_times: + for recv in recv_ctl[id2]: + delay = recv - send + if reverse_delay + delay > 0 and delay < min_delay: + # print("New min delay %.1f us (rtt %.1f) from %d " + # "to %d (forward ctl) id %s send %.1f recv %.1f" % (delay, + # delay + reverse_delay, fsend_node, frecv_node, + # id, send, recv)) + min_delay = delay + min_delays[fsend_node][frecv_node] = delay + min_recv_times[fsend_node][frecv_node] = recv + +def get_node_num(tt_file): + """ + Given a timetrace file name with a node number in it somewhere, + extract the number. + """ + match = re.match('[^0-9]*([0-9]+)', tt_file) + if match: + return int(match.group(1)) + return tt_file + +def peer_id(id): + """ - Given two results from parse_tt, return a list containing all the - delays from a packet sent in p1 and received in p2. The list will - be sorted in increasing order. + Given a (string) RPC identifier, return the identifier used for that RPC + on the peer node.
+ """ + + return str(int(id)^1) + +tt_files.sort(key = lambda name : get_node_num(name)) +node_names = [Path(tt_file).stem for tt_file in tt_files] +num_nodes = len(tt_files) +print('Trace file statistics:') +print('File: Name of trace file') +print('Records: Total number of timetrace records') +print('Sends: Total number of packets sent') +print('Receives: Total number of packets received (will be more than Sends') +print(' because of TSO)') +print('Timespan: Elapsed time between first and last timetrace records (ms)') +print('\nFile Records Sends Receives Timespan') +for i in range(num_nodes): + parse_tt(tt_files[i],i) +for id, addr in id_addr.items(): + if id in id_node_num: + addr_node_num[addr] = id_node_num[id] +find_min_delays(num_nodes) +find_min_delays_alt(num_nodes) + +# List of offset info for all nodes; index = node id; elements are +# dictionaries with the following entries: +# ref: Node that was used to synchronize this node. -1 means +# this node isn't yet synchronized. +# ref_offset: Amount to add to node's clock to sync with ref. +# offset: Amount to add to node's clock to sync with index 0. +offsets = [] +offsets.append({'ref': 0, 'ref_offset': 0.0, 'offset': 0.0}) +for i in range(1, num_nodes): + offsets.append({'ref': -1, 'ref_offset': 0.0, 'offset': 0.0}) + +# Compute clock offsets and min delays. In the simplest case the first +# node will be used as a reference for all the others, but this may not be +# possible if a node hasn't communicated with the first one. Also, the +# sync is likely to be inaccurate if the minimum RTT is very high. +# Each iteration through the following loop finds one node to sync, looking +# for a node that has a low RTT to one of the nodes that's already +# synced. +synced = 1 +while synced < num_nodes: + # Look for an unsynced node that we can sync. + best_node = None + best_ref = None + best_rtt = 1e20 + for node in range(1, num_nodes): + if offsets[node]['ref'] >= 0: continue - info1 = p1[key] - info2 = p2[key] - if (not "send" in info1) or (not "gro_recv" in info2): + # Look for a synced node that can be used as reference for node i. + for ref in range(0, num_nodes): + if offsets[ref]['ref'] < 0: + # This candidate isn't synced. + continue + if (min_delays[node][ref] > 1e10) or (min_delays[ref][node] > 1e10): + # No traffic between these nodes. + continue + # ref can potentially serve as reference for i. + rtt = min_delays[ref][node] + min_delays[node][ref] + if rtt < 0: + print('Negative RTT %.1f between %s (recv %.3f, delay %.3f) and ' + '%s (recv %.3f, delay %.3f),' % (rtt, node_names[ref], + min_recv_times[node][ref], min_delays[node][ref], + node_names[node], min_recv_times[ref][node], + min_delays[ref][node])) + if (rtt < best_rtt) and (rtt > 0): + best_node = node + best_ref = ref + best_rtt = rtt + if best_rtt < 15.0: + break + if best_rtt < 15.0: + break + if best_node == None: + # The remaining unsynced nodes can't be synced; print a message. 
+ unsynced = [] + for i in range(1, num_nodes): + if offsets[i]['ref'] < 0: + unsynced.append(node_names[i]) + print('The following nodes couldn\'t be synced: %s (no traffic between\n' + 'these nodes and other nodes)' % + (', '.join(unsynced)), file=sys.stderr) + exit(1) + + ref_offset = best_rtt/2 - min_delays[best_ref][best_node] + offsets[best_node] = {'ref': best_ref, 'ref_offset': ref_offset, + 'offset': offsets[best_ref]['offset'] + ref_offset}; + synced += 1 + +print('\nTime offsets computed for each node:') +print('Ref: Reference node used to sync this node') +print('MinOut: Smallest time difference (unsynced clocks) for a packet') +print(' to get from Ref to this node') +print('MinBack: Smallest time difference (unsynced clocks) for a packet') +print(' to get from this node to Ref') +print('MinRTT: Minimum RTT (computed from MinOut and MinBack)') +print('RefOffset: Add this to node\'s clock to align with Ref') +print('Offset: Add this to node\'s clock to align with %s' % (node_names[0])) +print('\nNode Ref MinOut MinBack Min RTT RefOffset Offset') +print('%-10s %-10s %8.1f %8.1f %7.1f %8.1f %8.1f' % (node_names[0], "N/A", + 0.0, 0.0, 0.0, 0.0, 0.0)) +for node in range(1, num_nodes): + min_rtt = min_delays[0][node] + min_delays[node][0] + info = offsets[node] + ref = info['ref'] + min_rtt = min_delays[ref][node] + min_delays[node][ref] + print('%-10s %-10s %8.1f %8.1f %7.1f %8.1f %8.1f' % (node_names[node], + node_names[ref], min_delays[ref][node], min_delays[node][ref], + min_rtt, info['ref_offset'], info['offset'])) + +# Check for consistency (with these offsets, will all one-way delays be +# positive?) +for src in range(num_nodes): + for dst in range(num_nodes): + if src == dst: continue - delay = info2["gro_recv"] - info1["send"] - delays.append(delay) - return sorted(delays) - -client = parse_tt(client_trace, False) -server = parse_tt(server_trace, True) - -c_to_s = get_delays(client, server) -s_to_c = get_delays(server, client) - -if verbose: - print("Found %d packets from client to server, %d from server to client" % ( - len(c_to_s), len(s_to_c)), file=sys.stderr) - -min_rtt = (c_to_s[5*len(c_to_s)//100] + s_to_c[5*len(s_to_c)//100]) -print("RTT: P0 %.1f us, P5 %.1f us, P10 %.1fus, P20 %.1f us, P50 %.1f us" % ( - c_to_s[0] + s_to_c[0], min_rtt, - c_to_s[10*len(c_to_s)//100] + s_to_c[10*len(s_to_c)//100], - c_to_s[20*len(c_to_s)//100] + s_to_c[20*len(s_to_c)//100], - c_to_s[50*len(c_to_s)//100] + s_to_c[50*len(s_to_c)//100]), - file=sys.stderr) -offset = min_rtt/2 - c_to_s[5*len(c_to_s)//100] - -if verbose: - print("Server clock offset (assuming %.1f us RTT): %.1fus" % ( - min_rtt, offset), file=sys.stderr) - print("Client->server packet delays: min %.1f us, P50 %.1f us, " - "P90 %.1f us, P99 %.1f us" % ( - c_to_s[0] + offset, - c_to_s[len(c_to_s)//2] + offset, - c_to_s[len(c_to_s)*9//10] + offset, - c_to_s[len(c_to_s)*99//100] + offset), file=sys.stderr) - print("Server->client packet delays: min %.1f us, P50 %.1f us, " - "P90 %.1f us, P99 %.1f us" % ( - s_to_c[0] - offset, - s_to_c[len(s_to_c)//2] - offset, - s_to_c[len(s_to_c)*9//10] - offset, - s_to_c[len(s_to_c)*99//100] - offset), file=sys.stderr) - -# Now re-read the server's trace and output a new trace whose -# clock is aligned with the client. 
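# Worked example of the offset computation used above, assuming the minimum
# one-way delays are symmetric (the numbers are made up). Suppose node B's
# clock runs 100 us ahead of node A and the true minimum one-way delay is
# 5 us in each direction:
min_a_to_b = 5 + 100        # measured A->B: recv time on B minus send on A
min_b_to_a = 5 - 100        # measured B->A: the offset enters negatively
rtt = min_a_to_b + min_b_to_a     # offsets cancel, leaving the true 10 us
ref_offset = rtt/2 - min_a_to_b   # -100: add this to B's clock to sync
assert rtt == 10 and ref_offset == -100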
-
-for line in open(server_trace):
-    match = re.match(' *([-0-9.]+) us (\(\+ *[-0-9.]+ us\) \[C[0-9]+\].*)',
-        line)
-    if not match:
-        print(line)
-    else:
-        time = float(match.group(1)) + offset
-        print("%9.3f us %s" % (time, match.group(2)))
\ No newline at end of file
+        src_offset = offsets[src]['offset']
+        dst_offset = offsets[dst]['offset']
+        new_min = min_delays[src][dst] + dst_offset - src_offset
+        if new_min < 0:
+            print('Problematic offsets for %s (%.1f) and %s (%.1f)'
+                    % (node_names[src], src_offset, node_names[dst], dst_offset))
+            print('    minimum delay %.1f becomes %.1f, received at %9.3f' %
+                    (min_delays[src][dst], new_min,
+                    min_recv_times[src][dst] + dst_offset))
+
+# Rewrite traces with synchronized times.
+if not options.no_rewrite:
+    print("")
+    for i in range(1, num_nodes):
+        offset = offsets[i]['offset']
+        src = open(tt_files[i])
+        dst = tempfile.NamedTemporaryFile(dir=os.path.dirname(tt_files[i]),
+                mode='w', encoding='utf-8', delete=False)
+        print("Rewriting %s with offset %.1f usec" % (tt_files[i], offset))
+        for line in src:
+            match = re.match(r' *([-0-9.]+) us (\(\+ *[-0-9.]+ us\) \[C[0-9]+\].*)',
+                    line)
+            if not match:
+                # line still ends with its newline, so write it directly
+                # rather than using print (which would add a second one).
+                dst.write(line)
+            else:
+                time = float(match.group(1)) + offset
+                dst.write('%9.3f us %s\n' % (time, match.group(2)))
+        dst.close()
+        os.rename(dst.name, tt_files[i])
diff --git a/util/ttsyslog.py b/util/ttsyslog.py
new file mode 100755
index 00000000..e6e1260f
--- /dev/null
+++ b/util/ttsyslog.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python3
+
+# Copyright (c) 2019-2022 Homa Developers
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+"""
+This program reads timetrace information that was printk-ed to the
+system log, removes extraneous syslog information, and prints the
+records with times in microseconds instead of clock cycles.
+
+Usage:
+ttsyslog.py [--extra file2] [file]
+
+If no file is given, the information is read from standard input.
+If "--extra file2" is specified, all of the lines that are *not* valid
+timetrace records are written to file2.
+"""
+
+from __future__ import division, print_function
+import re
+import sys
+
+# Clock cycles per nanosecond.
+cpu_ghz = None
+
+# Time in cycles of first event.
+first_time = 0
+
+# Time in cycles of previous event.
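+# (used to compute the "(+N us)" delta printed for each record).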
+prev_time = 0
+
+extra = None
+if len(sys.argv) > 2:
+    if sys.argv[1] == '--extra':
+        extra = open(sys.argv[2], 'w')
+        del sys.argv[1:3]
+f = sys.stdin
+if len(sys.argv) > 1:
+    f = open(sys.argv[1])
+
+lines = []
+
+for line in f:
+    line = line.rstrip()
+    if line.endswith('^M'):
+        line = line[:-2]
+
+    if cpu_ghz is None:
+        match = re.match('.*cpu_khz: ([0-9.]+)', line)
+        if match:
+            cpu_ghz = float(match.group(1))*1e-06
+            continue
+
+    lines.append(line)
+
+if cpu_ghz is None:
+    print("Couldn't find initial line with clock speed", file=sys.stderr)
+    exit(1)
+
+for line in lines:
+    match = re.match(r'.* ([0-9.]+) (\[C..\] .+)', line)
+    if not match:
+        continue
+    this_time = float(match.group(1))
+    this_event = match.group(2)
+    if first_time == 0.0:
+        first_time = this_time
+        prev_time = this_time
+        print('%9.3f us (+%8.3f us) [C00] First event has timestamp %s '
+                '(cpu_ghz %.15f)' % (0, 0, match.group(1), cpu_ghz))
+    print('%9.3f us (+%8.3f us) %s' % (
+            (this_time - first_time)/(1000.0 * cpu_ghz),
+            (this_time - prev_time)/(1000.0 * cpu_ghz), this_event))
+    prev_time = this_time
+
+if extra:
+    for line in lines:
+        if not re.match(r'.* ([0-9.]+) (\[C..\] .+)', line):
+            extra.write(line)
+            extra.write('\n')
+    extra.close()
diff --git a/util/ttxmit.py b/util/ttxmit.py
deleted file mode 100755
index d794e13c..00000000
--- a/util/ttxmit.py
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/python3
-
-"""
-Analyzes packet transmissions in a timetrace to find gaps where the
-uplink was unnecessarily idle.
-
-Usage: ttxmit.py [--verbose] [--gbps n] [trace]
-
-If no timetrace file is given, this script reads timetrace info from stdin.
-"""
-
-from __future__ import division, print_function
-from glob import glob
-from optparse import OptionParser
-import math
-from operator import itemgetter
-import os
-import re
-import string
-import sys
-from statistics import median
-
-# Parse command line options
-parser = OptionParser(description=
-    'Read a timetrace and output information about gaps in data packet '
-    'transmissions.',
-    usage='%prog [options] [trace]',
-    conflict_handler='resolve')
-parser.add_option('--verbose', '-v', action='store_true', default=False,
-    dest='verbose',
-    help='print lots of output')
-parser.add_option('--gbps', type='int', dest='gbps', default=25,
-    help='network speed in Gbps')
-
-(options, extra) = parser.parse_args()
-f = sys.stdin
-if len(extra) > 0:
-    f = open(extra[0])
-    if len(extra) > 1:
-        print("Unrecognized argument %s" % (extra[1]))
-        exit(1)
-
-# Time when all of the output packets presented to the NIC will have
-# been fully transmitted.
-idle_time = 0
-
-# Will eventually hold the amount of data in a full-sized output
-# packet (before GSO chops it up).
-packet_size = 1000
-
-# Dictionary holding one entry for each RPC that is currently active
-# (some of its bytes have been transmitted, but not all). Index is
-# RPC id, value is a list <time, offset> giving the time when the most
-# recent packet was transmitted for the RPC and the offset of that
-# packet's data.
-active_rpcs = {}
-
-# Total number of RPCs that completed during the trace.
-completed_rpcs = 0
-
-# Total time when there was at least one active RPC.
-active_usecs = 0
-
-# Total time in all gaps.
-gap_usecs = 0
-
-# Time when len(active_rpcs) went from 0 to 1.
-active_start = 0
-
-# Time when len(active_rpcs) became 0.
-active_end = 0
-
-# Total number of data packets sent.
-total_packets = 0
-
-# Total amount of data transmitted.
-total_bytes = 0
-
-# Total number of packets that experienced gaps >= long_gap.
-long_gaps = 0
-
-# Threshold length for a gap to be considered "long".
-long_gap = 2.0
-
-# One entry for each period of time when the uplink was idle yet there
-# were active outgoing RPCs. Each entry is a list <duration, start, end,
-# active, id, offset>: duration is the length of the gap, start and end
-# give the range of the idle period, active counts the number of active
-# RPCs at the end of the interval, and id and offset identify the packet
-# whose transmission ended the gap.
-gaps = []
-
-# Holds the duration of all the gaps that were caused by lack of grants.
-grant_gaps = []
-
-# One entry for each period of time when there were no active RPCs.
-# Each entry is a list <duration, start, end, id>: duration is the length
-# of the gap, start and end give the range, and id identifies the RPC that
-# ended the gap.
-inactive_gaps = []
-
-# Keys are RPC ids; each value is the total number of bytes granted for
-# that RPC (i.e. the index of the first byte not yet granted).
-granted = {}
-
-# Keys are RPC ids, values are meaningless. If an entry is present, it
-# means that the most recently transmitted packet used up all of the
-# granted bytes, so the next packet will have to wait for a grant.
-needs_grant = {}
-
-for line in f:
-    match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] '
-            'calling .*_xmit: wire_bytes ([0-9]+), .* id ([0-9]+), '
-            'offset ([0-9]+)', line)
-    if match:
-        time = float(match.group(1))
-        core = match.group(3)
-        length = int(match.group(4))
-        id = match.group(5)
-        offset = int(match.group(6))
-
-        total_packets += 1
-        total_bytes += length
-        if packet_size < length:
-            packet_size = length
-
-        if (idle_time < time) and (len(active_rpcs) > 0):
-            gap_length = time - idle_time
-            gaps.append([gap_length, idle_time, time, len(active_rpcs), id, offset])
-            gap_usecs += gap_length
-            if gap_length >= long_gap:
-                long_gaps += 1
-            if id in needs_grant:
-                grant_gaps.append(gap_length)
-
-        if (id in granted) and ((offset + length) >= granted[id]):
-            needs_grant[id] = True
-        else:
-            needs_grant.pop(id, None)
-
-        if len(active_rpcs) == 0:
-            if idle_time < time:
-                active_start = time
-                if active_end != 0:
-                    inactive_gaps.append([time - active_end, active_end, time, id])
-            else:
-                active_start = idle_time
-
-        xmit_time = (length * 8)/(options.gbps * 1000)
-        if (idle_time < time):
-            idle_time = time + xmit_time
-        else:
-            idle_time += xmit_time
-
-        if length < packet_size:
-            active_rpcs.pop(id, None)
-            completed_rpcs += 1
-        else:
-            active_rpcs[id] = [time, id]
-
-        if len(active_rpcs) == 0:
-            active_usecs += idle_time - active_start
-            active_end = idle_time
-
-    match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] '
-            'processing grant for id ([0-9]+), offset ([0-9]+)', line)
-    if match:
-        id = match.group(4)
-        offset = int(match.group(5))
-        granted[id] = offset
-
-    match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] '
-            'starting copy from user .* id ([0-9]+),.* unscheduled ([0-9]+)',
-            line)
-    if match:
-        id = match.group(4)
-        unsched = int(match.group(5))
-        granted[id] = unsched
-
-if len(active_rpcs):
-    active_usecs += time - active_start
-
-print("RPC active time: %9.1f us (%.1f%% of elapsed time)" % (
-    active_usecs, 100.0*active_usecs/time))
-print("Total xmit gaps: %9.1f us (%.1f%% of active time)" % (
-    gap_usecs, 100.0*gap_usecs/active_usecs))
-print("Average xmit gap: %9.1f us" % (gap_usecs/total_packets))
-grant_gap_usecs = sum(grant_gaps)
-print("Gaps caused by delayed grants: %9.1f us (%.1f%% of all gap
time)" % ( - grant_gap_usecs, 100.0*grant_gap_usecs/gap_usecs)) -print("%d data packets (%.1f%% of all packets) were delayed waiting for grants" - % (len(grant_gaps), 100*len(grant_gaps)/total_packets)) -print('%d data packets (%.1f%% of all packets) were delayed by gaps ' - '>= %.1f us' % (long_gaps, 100*long_gaps/ total_packets, - long_gap)) -print("Network bandwidth consumed when RPCs active: %.1f Gbps" % ( - total_bytes*8.0/(active_usecs*1e03))) -if (completed_rpcs > 0): - print("Average delay/RPC caused by missing grants: %.1f usec" % ( - grant_gap_usecs/completed_rpcs)) - -gaps = sorted(gaps, key=itemgetter(0), reverse=True) -print("\nLongest gaps:") -count = 0 -for gap in gaps: - print("%9.3f: gap of %5.1f us (starting at %9.3f), id %s, offset %d" % ( - gap[2], gap[0], gap[1], gap[4], gap[5])) - count += 1 - if count >= 10: - break - -gaps.reverse() -print("\nGap CDF (% of total gap time in gaps <= given size):") -print("Percent Gap") -pctl = 0 -total_usecs = 0 -for gap in gaps: - total_usecs += gap[0] - if (total_usecs >= pctl*gap_usecs/100): - print("%5d %5.1f us" % (pctl, gap[0])) - pctl += 10 -if pctl <= 100: - print("%5d %5.1f us" % (100, gaps[-1][0])) - -if len(grant_gaps) > 0: - grant_gaps = sorted(grant_gaps) - print("\nCDF of gaps caused by grants (% of total grant gap time " - "in gaps <= given size):") - print("Percent Gap") - pctl = 0 - total_usecs = 0 - for gap in grant_gaps: - total_usecs += gap - if (total_usecs >= pctl*grant_gap_usecs/100): - print("%5d %5.1f us" % (pctl, gap)) - pctl += 10 - if pctl <= 100: - print("%5d %5.1f us" % (100, grant_gaps[-1])) - -if inactive_gaps: - inactive_gaps = sorted(inactive_gaps, key=itemgetter(0), reverse=True) - print("\nLongest intervals with no active RPCs:") - count = 0 - for gap in inactive_gaps: - print("%9.3f: %5.1f us starting at %9.3f, ending with id %s" % ( - gap[2], gap[0], gap[1], gap[3])) - count += 1 - if count >= 10: - break \ No newline at end of file diff --git a/util/use_memory.c b/util/use_memory.c index 12d26c67..d6c82c98 100644 --- a/util/use_memory.c +++ b/util/use_memory.c @@ -1,16 +1,5 @@ -/* Copyright (c) 2019-2022 Stanford University - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/* Copyright (c) 2019-2022 Homa Developers + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This program allocates a given amount of memory and then sleeps