#ifndef IO_URING_TYPES_H
#define IO_URING_TYPES_H

#include <linux/blkdev.h>
#include <linux/hashtable.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>

enum {
	/*
	 * A hint to not wake right away but delay until enough task_work
	 * items are queued to match the number of CQEs the task is
	 * waiting for.
	 *
	 * Must not be used with requests generating more than one CQE.
	 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
	 */
	IOU_F_TWQ_LAZY_WAKE = 1,
};
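
/*
 * Illustrative sketch only, not part of this header: a request that posts
 * exactly one CQE can pass the hint when queueing its task_work, assuming a
 * task_work-add helper that takes a flags argument (such as
 * __io_req_task_work_add() in io_uring/io_uring.h):
 *
 *	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
 *
 * On rings without IORING_SETUP_DEFER_TASKRUN the hint is ignored and the
 * wakeup happens as usual.
 */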

enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER = 1,
	IO_URING_F_UNLOCKED = 2,
	/* the request is executed from poll, it should not be freed */
	IO_URING_F_MULTISHOT = 4,
	/* executed by io-wq */
	IO_URING_F_IOWQ = 8,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK = INT_MIN,

	/* ctx state flags, for URING_CMD */
	IO_URING_F_SQE128 = (1 << 8),
	IO_URING_F_CQE32 = (1 << 9),
	IO_URING_F_IOPOLL = (1 << 10),

	/* set when uring wants to cancel a previously issued command */
	IO_URING_F_CANCEL = (1 << 11),
	IO_URING_F_COMPAT = (1 << 12),
	IO_URING_F_TASK_DEAD = (1 << 13),
};

struct io_wq_work_node {
	struct io_wq_work_node *next;
};

struct io_wq_work_list {
	struct io_wq_work_node *first;
	struct io_wq_work_node *last;
};
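
/*
 * Minimal sketch of how this intrusive singly linked list is appended to;
 * the real helpers (e.g. wq_list_add_tail()) live in io_uring/slist.h, this
 * is for illustration only:
 *
 *	static inline void example_wq_list_add_tail(struct io_wq_work_node *node,
 *			struct io_wq_work_list *list)
 *	{
 *		node->next = NULL;
 *		if (!list->first) {
 *			list->first = node;
 *			list->last = node;
 *		} else {
 *			list->last->next = node;
 *			list->last = node;
 *		}
 *	}
 */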

struct io_wq_work {
	struct io_wq_work_node list;
	atomic_t flags;
	/* place it here instead of io_kiocb as it fills padding and saves 4B */
	int cancel_seq;
};

struct io_rsrc_data {
	unsigned int nr;
	struct io_rsrc_node **nodes;
};

struct io_file_table {
	struct io_rsrc_data data;
	unsigned long *bitmap;
	unsigned int alloc_hint;
};

struct io_hash_bucket {
	struct hlist_head list;
} ____cacheline_aligned_in_smp;

struct io_hash_table {
	struct io_hash_bucket *hbs;
	unsigned hash_bits;
};
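
/*
 * Illustrative only: hash_bits encodes the table size as a power of two, so
 * a bucket is picked roughly the way the poll/cancel code does it, by hashing
 * down to hash_bits bits:
 *
 *	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
 *	struct io_hash_bucket *hb = &table->hbs[index];
 *
 *	hlist_add_head(&req->hash_node, &hb->list);
 */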

struct io_mapped_region {
	struct page **pages;
	void *ptr;
	unsigned nr_pages;
	unsigned flags;
};

/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int cached_refs;
	const struct io_ring_ctx *last;
	struct task_struct *task;
	struct io_wq *io_wq;
	struct file *registered_rings[IO_RINGFD_REG_MAX];

	struct xarray xa;
	struct wait_queue_head wait;
	atomic_t in_cancel;
	atomic_t inflight_tracked;
	struct percpu_counter inflight;

	struct { /* task_work */
		struct llist_head task_list;
		struct callback_head task_work;
	} ____cacheline_aligned_in_smp;
};

struct iou_vec {
	union {
		struct iovec *iovec;
		struct bio_vec *bvec;
	};
	unsigned nr; /* number of struct iovec it can hold */
};

struct io_uring {
	u32 head;
	u32 tail;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls the head of the SQ ring and the tail of the
	 * CQ ring, and the application controls the tail of the SQ ring
	 * and the head of the CQ ring.
	 */
	struct io_uring sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32 sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32 sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to an
	 * invalid index stored in the array.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32 sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	atomic_t sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32 cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32 cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
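
/*
 * Illustrative sketch of how the masking described above is used to consume
 * CQEs; liburing wraps this up for real applications, and handle_cqe() below
 * is a hypothetical consumer, not an existing function. Ordering is shown
 * with the kernel's acquire/release helpers:
 *
 *	u32 head = rings->cq.head;
 *
 *	while (head != smp_load_acquire(&rings->cq.tail)) {
 *		struct io_uring_cqe *cqe = &rings->cqes[head & rings->cq_ring_mask];
 *
 *		handle_cqe(cqe);
 *		head++;
 *	}
 *	smp_store_release(&rings->cq.head, head);
 */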

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

struct io_submit_link {
	struct io_kiocb *head;
	struct io_kiocb *last;
};

struct io_submit_state {
	/* inline/task_work completion list, under ->uring_lock */
	struct io_wq_work_node free_list;
	/* batch completion logic */
	struct io_wq_work_list compl_reqs;
	struct io_submit_link link;

	bool plug_started;
	bool need_plug;
	bool cq_flush;
	unsigned short submit_nr;
	struct blk_plug plug;
};

struct io_alloc_cache {
	void **entries;
	unsigned int nr_cached;
	unsigned int max_cached;
	unsigned int elem_size;
	unsigned int init_clear;
};

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		unsigned int flags;
		unsigned int drain_next: 1;
		unsigned int restricted: 1;
		unsigned int off_timeout_used: 1;
		unsigned int drain_active: 1;
		unsigned int has_evfd: 1;
		/* all CQEs should be posted only by the submitter task */
		unsigned int task_complete: 1;
		unsigned int lockless_cq: 1;
		unsigned int syscall_iopoll: 1;
		unsigned int poll_activated: 1;
		unsigned int drain_disabled: 1;
		unsigned int compat: 1;
		unsigned int iowq_limits_set: 1;

		struct task_struct *submitter_task;
		struct io_rings *rings;
		struct percpu_ref refs;

		clockid_t clockid;
		enum tk_offsets clock_offset;

		enum task_work_notify_mode notify_method;
		unsigned sq_thread_idle;
	} ____cacheline_aligned_in_smp;

	/* submission data */
	struct {
		struct mutex uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32 *sq_array;
		struct io_uring_sqe *sq_sqes;
		unsigned cached_sq_head;
		unsigned sq_entries;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		atomic_t cancel_seq;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		bool poll_multi_queue;
		struct io_wq_work_list iopoll_list;

		struct io_file_table file_table;
		struct io_rsrc_data buf_table;
		struct io_alloc_cache node_cache;
		struct io_alloc_cache imu_cache;

		struct io_submit_state submit_state;

		/*
		 * Modifications are protected by ->uring_lock and ->mmap_lock.
		 * The flags, buf_pages and buf_nr_pages fields should be stable
		 * once published.
		 */
		struct xarray io_bl_xa;

		struct io_hash_table cancel_table;
		struct io_alloc_cache apoll_cache;
		struct io_alloc_cache netmsg_cache;
		struct io_alloc_cache rw_cache;
		struct io_alloc_cache cmd_cache;

		/*
		 * Any cancelable uring_cmd is added to this list in
		 * ->uring_cmd() by io_uring_cmd_mark_cancelable()
		 */
		struct hlist_head cancelable_uring_cmd;
		/*
		 * For hybrid IOPOLL: runtime spent in hybrid polling,
		 * excluding scheduling time
		 */
		u64 hybrid_poll_time;
	} ____cacheline_aligned_in_smp;

	struct {
		/*
		 * We cache a range of free CQEs we can use, once exhausted it
		 * should go through a slower range setup, see __io_get_cqe()
		 */
		struct io_uring_cqe *cqe_cached;
		struct io_uring_cqe *cqe_sentinel;

		unsigned cached_cq_tail;
		unsigned cq_entries;
		struct io_ev_fd __rcu *io_ev_fd;

		void *cq_wait_arg;
		size_t cq_wait_size;
	} ____cacheline_aligned_in_smp;

	/*
	 * task_work and async notification delivery cacheline. Expected to
	 * regularly bounce between CPUs.
	 */
	struct {
		struct llist_head work_llist;
		struct llist_head retry_llist;
		unsigned long check_cq;
		atomic_t cq_wait_nr;
		atomic_t cq_timeouts;
		struct wait_queue_head cq_wait;
	} ____cacheline_aligned_in_smp;

	/* timeouts */
	struct {
		raw_spinlock_t timeout_lock;
		struct list_head timeout_list;
		struct list_head ltimeout_list;
		unsigned cq_last_tm_flush;
	} ____cacheline_aligned_in_smp;

	spinlock_t completion_lock;

	struct list_head cq_overflow_list;

	struct hlist_head waitid_list;

#ifdef CONFIG_FUTEX
	struct hlist_head futex_list;
	struct io_alloc_cache futex_cache;
#endif

	const struct cred *sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data *sq_data;	/* if using sq thread polling */

	struct wait_queue_head sqo_sq_wait;
	struct list_head sqd_list;

	unsigned int file_alloc_start;
	unsigned int file_alloc_end;

	/* Keep this last, we don't need it for the fast path */
	struct wait_queue_head poll_wq;
	struct io_restriction restrictions;

	/* Stores zcrx object pointers of type struct io_zcrx_ifq */
	struct xarray zcrx_ctxs;

	u32 pers_next;
	struct xarray personalities;

	/* hashed buffered write serialization */
	struct io_wq_hash *hash_map;

	/* Only used for accounting purposes */
	struct user_struct *user;
	struct mm_struct *mm_account;

	/* ctx exit and cancelation */
	struct llist_head fallback_llist;
	struct delayed_work fallback_work;
	struct work_struct exit_work;
	struct list_head tctx_list;
	struct completion ref_comp;

	/* io-wq management, e.g. thread count */
	u32 iowq_limits[2];

	struct callback_head poll_wq_task_work;
	struct list_head defer_list;
	unsigned nr_drained;

	struct io_alloc_cache msg_cache;
	spinlock_t msg_lock;

#ifdef CONFIG_NET_RX_BUSY_POLL
	struct list_head napi_list;	/* track busy poll napi_id */
	spinlock_t napi_lock;		/* napi_list lock */

	/* napi busy poll default timeout */
	ktime_t napi_busy_poll_dt;
	bool napi_prefer_busy_poll;
	u8 napi_track_mode;

	DECLARE_HASHTABLE(napi_ht, 4);
#endif

	/* protected by ->completion_lock */
	unsigned evfd_last_cq_tail;
	unsigned nr_req_allocated;

	/*
	 * Protection for resize vs mmap races - both the mmap and resize
	 * side will need to grab this lock, to prevent either side from
	 * being run concurrently with the other.
	 */
	struct mutex mmap_lock;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
	/* used for optimised request parameter and wait argument passing */
	struct io_mapped_region param_region;
};

/*
 * Token indicating function is called in task work context:
 * ctx->uring_lock is held and any completions generated will be flushed.
 * ONLY core io_uring.c should instantiate this struct.
 */
struct io_tw_state {
};
/* Alias to use in code that doesn't instantiate struct io_tw_state */
typedef struct io_tw_state io_tw_token_t;

enum {
	REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
	REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_BIT = 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_HYBRID_IOPOLL_STATE_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_BUFFER_RING_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_CREDS_BIT,
	REQ_F_REFCOUNT_BIT,
	REQ_F_ARM_LTIMEOUT_BIT,
	REQ_F_ASYNC_DATA_BIT,
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_MULTISHOT_BIT,
	REQ_F_APOLL_MULTISHOT_BIT,
	REQ_F_CLEAR_POLLIN_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,
	REQ_F_POLL_NO_LAZY_BIT,
	REQ_F_CAN_POLL_BIT,
	REQ_F_BL_EMPTY_BIT,
	REQ_F_BL_NO_RECYCLE_BIT,
	REQ_F_BUFFERS_COMMIT_BIT,
	REQ_F_BUF_NODE_BIT,
	REQ_F_HAS_METADATA_BIT,
	REQ_F_IMPORT_BUFFER_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};

typedef u64 __bitwise io_req_flags_t;
#define IO_REQ_FLAG(bitno)	((__force io_req_flags_t) BIT_ULL((bitno)))
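
/*
 * Illustrative only: each REQ_F_* value below is simply BIT_ULL() of the
 * matching *_BIT above, e.g. REQ_F_FAIL == BIT_ULL(REQ_F_FAIL_BIT).
 * A build-time sanity check along these lines (the core io_uring code
 * carries a similar one) keeps the bits within io_req_flags_t:
 *
 *	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(io_req_flags_t));
 */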

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
	/* IOSQE_CQE_SKIP_SUCCESS */
	REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),

	/* fail rest of links */
	REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT),
	/* every req only blocks once in hybrid poll */
	REQ_F_IOPOLL_STATE = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
	/* buffer selected from ring, needs commit */
	REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT),
	/* supports async reads/writes */
	REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT),
	/* regular file */
	REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT),
	/* skip refcounting if not set */
	REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT),
	/* there is a linked timeout that has to be armed */
	REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT),
	/* ->async_data allocated */
	REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT),
	/* don't post CQEs while failing linked requests */
	REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT),
	/* single poll may be active */
	REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
	/* double poll may be active */
	REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
	/* request posts multiple completions, should be set at prep time */
	REQ_F_MULTISHOT = IO_REQ_FLAG(REQ_F_MULTISHOT_BIT),
	/* fast poll multishot mode */
	REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
	/* recvmsg special flag, clear EPOLLIN */
	REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
	/* don't use lazy poll wake for this request */
	REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
	/* file is pollable */
	REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
	/* buffer list was empty after selection of buffer */
	REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
	/* don't recycle provided buffers for this request */
	REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
	/* buffer ring head needs incrementing on put */
	REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
	/* buf node is valid */
	REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
	/* request has read/write metadata assigned */
	REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
	/*
	 * For vectored fixed buffers, resolve iovec to registered buffers.
	 * For SEND_ZC, whether to import buffers (i.e. the first issue).
	 */
	REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);

struct io_task_work {
	struct llist_node node;
	io_req_tw_func_t func;
};
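
/*
 * Illustrative only: a task_work callback matches io_req_tw_func_t and runs
 * with the io_tw_state token, i.e. under ctx->uring_lock. io_example_tw is a
 * hypothetical name, and the sketch assumes the io_req_task_complete() and
 * io_req_task_work_add() helpers from io_uring/io_uring.h:
 *
 *	static void io_example_tw(struct io_kiocb *req, io_tw_token_t tw)
 *	{
 *		io_req_task_complete(req, tw);
 *	}
 *
 *	req->io_task_work.func = io_example_tw;
 *	io_req_task_work_add(req);
 */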

struct io_cqe {
	__u64 user_data;
	__s32 res;
	/* fd initially, then cflags for completion */
	union {
		__u32 flags;
		int fd;
	};
};

/*
 * Each request type overlays its private data structure on top of this one.
 * They must not exceed this one in size.
 */
struct io_cmd_data {
	struct file *file;
	/* each command gets 56 bytes of data */
	__u8 data[56];
};

static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
{
	BUILD_BUG_ON(cmd_sz > sizeof(struct io_cmd_data));
}
#define io_kiocb_to_cmd(req, cmd_type) ( \
	io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \
	((cmd_type *)&(req)->cmd) \
)

static inline struct io_kiocb *cmd_to_io_kiocb(void *ptr)
{
	return ptr;
}
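
/*
 * Illustrative only: an opcode's private state overlays io_cmd_data inside
 * struct io_kiocb, so it must start with the file pointer and must not
 * exceed sizeof(struct io_cmd_data). io_example_op is a hypothetical op
 * struct, not an existing one:
 *
 *	struct io_example_op {
 *		struct file *file;
 *		int flags;
 *	};
 *
 *	// in the op's prep/issue handlers:
 *	struct io_example_op *op = io_kiocb_to_cmd(req, struct io_example_op);
 *	struct io_kiocb *orig_req = cmd_to_io_kiocb(op);
 *
 * io_kiocb_to_cmd() fails the build if the op struct outgrows io_cmd_data.
 */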

struct io_kiocb {
	union {
		/*
		 * NOTE! Each of the io_kiocb union members has the file pointer
		 * as the first entry in their struct definition. So you can
		 * access the file pointer through any of the sub-structs,
		 * or directly as just 'file' in this struct.
		 */
		struct file *file;
		struct io_cmd_data cmd;
	};

	u8 opcode;
	/* polled IO has completed */
	u8 iopoll_completed;
	/*
	 * Can be either a fixed buffer index, or used with provided buffers.
	 * For the latter, it points to the selected buffer ID.
	 */
	u16 buf_index;

	unsigned nr_tw;

	/* REQ_F_* flags */
	io_req_flags_t flags;

	struct io_cqe cqe;

	struct io_ring_ctx *ctx;
	struct io_uring_task *tctx;

	union {
		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
		struct io_buffer *kbuf;

		/*
		 * stores buffer ID for ring provided buffers, valid IFF
		 * REQ_F_BUFFER_RING is set.
		 */
		struct io_buffer_list *buf_list;

		struct io_rsrc_node *buf_node;
	};

	union {
		/* used by request caches, completion batching and iopoll */
		struct io_wq_work_node comp_list;
		/* cache ->apoll->events */
		__poll_t apoll_events;
	};

	struct io_rsrc_node *file_node;

	atomic_t refs;
	bool cancel_seq_set;
	struct io_task_work io_task_work;
	union {
		/*
		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
		 * poll
		 */
		struct hlist_node hash_node;
		/* For IOPOLL setup queues, with hybrid polling */
		u64 iopoll_start;
	};
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll *apoll;
	/* opcode allocated if it needs to store data for async defer */
	void *async_data;
	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
	atomic_t poll_refs;
	struct io_kiocb *link;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred *creds;
	struct io_wq_work work;

	struct io_big_cqe {
		u64 extra1;
		u64 extra2;
	} big_cqe;
};

struct io_overflow_cqe {
	struct list_head list;
	struct io_uring_cqe cqe;
};

static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
{
	return ctx->flags & IORING_SETUP_CQE32;
}

#endif