or processes simultaneously, but performance degrades seriously once memory is exhausted or when high I/O load causes a large volume of when high I/O load causes a large volume of context switches context switches." https://www.nginx.com/blog/inside-nginx-how-we-designed-for-performance-scale/ 4
as there are clients currently connected, which has some disadvantages when server workload must scale to handle large numbers of connections. [...] Exhaustion of other resources can occur as well, and scheduling overhead can become scheduling overhead can become significant significant. https://dev.mysql.com/doc/refman/5.7/en/connection-threads.html 5
from one thread to another requires a full context switch [...]. This operation is slow, due its poor locality and the number of memory accesses required. [...] Because it doesn't need a switch to kernel context, rescheduling a rescheduling a goroutine is much cheaper than rescheduling a thread goroutine is much cheaper than rescheduling a thread." Donovan & Kernighan, The Go Programming Language 6
kernel scheduler work, anyways? What about userspace schedulers? Are they radically different? What design patterns do scheduler implementations follow? What tradeoffs do they make? Questions! 8
benchmark: # Executed 1000000 pipe operations between two threads Total time: 4.498 [sec] 4.498076 usecs/op 222317 ops/sec 㱺 upper bound: 2.25 µs per thread context switch (2 context switches per read-write "op") Conveniently, this is part of the perf bench suite in Linux: Is that our final answer? 12
1 microsecond (on this machine, with caveats, etc.) What did we learn? Meta-lessons Benchmarking is tricky. Can't just run random experiments -- need introspection into scheduler Helpful to have some idea how the scheduler works! 22
void schedule(void) { int c; struct task_struct *p, *next; c = -1000; next = p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) break; if (p->state == TASK_RUNNING && p->counter > c) c = p->counter, next = p; } if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } if (current == next) return; switch_to(next); } We'll keep a list of running tasks 24
void schedule(void) { int c; struct task_struct *p, *next; c = -1000; next = p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) break; if (p->state == TASK_RUNNING && p->counter > c) c = p->counter, next = p; } if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } if (current == next) return; switch_to(next); } We'll keep a list of running tasks And when we need to schedule 25
need to schedule Iterate through our tasks Keep a countdown for each task Pick the task with the lowest countdown to run next struct task_struct* init_task; struct task_struct * task[512] = {&init_task, }; void schedule(void) { int c; struct task_struct *p, *next; c = -1000; next = p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) break; if (p->state == TASK_RUNNING && p->counter > c) c = p->counter, next = p; } if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } if (current == next) return; switch_to(next); } 27
need to schedule Iterate through our tasks Keep a countdown for each task Pick the task with the lowest countdown to run next Decrement countdown for each task (hi-pri tasks count down faster) struct task_struct* init_task; struct task_struct * task[512] = {&init_task, }; void schedule(void) { int c; struct task_struct *p, *next; c = -1000; next = p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) break; if (p->state == TASK_RUNNING && p->counter > c) c = p->counter, next = p; } if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } if (current == next) return; switch_to(next); } 28
need to schedule Iterate through our tasks Keep a countdown for each task Pick the task with the lowest countdown to run next Decrement countdown for each task (hi-pri tasks count down faster) Then switch to the next task struct task_struct* init_task; struct task_struct * task[512] = {&init_task, }; void schedule(void) { int c; struct task_struct *p, *next; c = -1000; next = p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) break; if (p->state == TASK_RUNNING && p->counter > c) c = p->counter, next = p; } if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } if (current == next) return; switch_to(next); } 29
void schedule(void) { int c; struct task_struct *p, *next; c = -1000; next = p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) break; if (p->state == TASK_RUNNING && p->counter > c) c = p->counter, next = p; } if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } if (current == next) return; switch_to(next); } This is how the Linux scheduler worked in 1995. 30
struct task_struct * next; unsigned long ticks; /* check alarm, wake up any interruptible tasks that have got a signal */ if (intr_count) { printk("Aiee: scheduling in interrupt\n"); intr_count = 0; } cli(); ticks = itimer_ticks; itimer_ticks = 0; itimer_next = ~0; sti(); need_resched = 0; p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) goto confuse_gcc1; if (ticks && p->it_real_value) { if (p->it_real_value <= ticks) { send_sig(SIGALRM, p, 1); if (!p->it_real_incr) { p->it_real_value = 0; goto end_itimer; } do { p->it_real_value += p->it_real_incr; } while (p->it_real_value <= ticks); } p->it_real_value -= ticks; if (p->it_real_value < itimer_next) itimer_next = p->it_real_value; } end_itimer: if (p->state != TASK_INTERRUPTIBLE) continue; if (p->signal & ~p->blocked) { p->state = TASK_RUNNING; continue; } if (p->timeout && p->timeout <= jiffies) { p->timeout = 0; p->state = TASK_RUNNING; } } confuse_gcc1: /* this is the scheduler proper: */ #if 0 /* give processes that go to sleep a bit higher priority.. */ /* This depends on the values for TASK_XXX */ /* This gives smoother scheduling for some things, but */ /* can be very unfair under some circumstances, so.. */ if (TASK_UNINTERRUPTIBLE >= (unsigned) current->state && current->counter < current->priority*2) { ++current->counter; } #endif c = -1000; next = p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) goto confuse_gcc2; if (p->state == TASK_RUNNING && p->counter > c) c = p->counter, next = p; } confuse_gcc2: if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } if (current == next) return; kstat.context_swtch++; switch_to(next); } struct task_struct* init_task; struct task_struct * task[512] = {&init_task, }; void schedule(void) { int c; struct task_struct *p, *next; c = -1000; next = p = &init_task; for (;;) { if ((p = p->next_task) == &init_task) break; if (p->state == TASK_RUNNING && p->counter > c) c = p->counter, next = p; } if (!c) { for_each_task(p) p->counter = (p->counter >> 1) + p->priority; } if (current == next) return; switch_to(next); } (okay, it was ~75 lines) 31
per-core basis (more about inter- CPU load balancing later). For each core, there's a runqueue of runnable tasks. This is actually a red-black tree, ordered by task vruntime (basically real runtime divided by task weight). As tasks run, they accumulate vruntime. (Note: We're talking about the 'fair' scheduler here. There are other, non-default scheduling policies too.) 33
current task is due for preemption, the timer handler sets a flag in the task's thread_info struct. Before returning to normal execution, we check that NEED_RESCHED flag, and call schedule() if we need to. 48
current task is due for preemption, the timer handler sets a flag in the task's thread_info struct. Before returning to normal execution, we check that NEED_RESCHED flag, and call schedule() if we need to. The schedule function dequeues the next task, enqueues the preempted one, swaps their processor state, and does some cleanup before actually running the next task. 49
weight (priority) than tasks A, B, D : Balancing runqueues based on length alone could deprive C of runtime. We could try balancing based on total task weight. 53
weight (priority) than tasks A, B, D : Balancing runqueues based on length alone could deprive C of runtime. We could try balancing based on total task weight. But if task C frequently sleeps, this is inefficient. So balancing uses a "load" metric based on task weight and task CPU utilization. 54
all these details? 3. Use ftrace, 'the function tracer' 1. Listen to some bozo's talk - Dynamically traces all* function entry/return points in the kernel! 2. Stare really hard at the source code *almost (not architecture-specific functions defined in assembly) 56
from one thread to another requires a full context switch [...]. This operation is slow, due its poor locality and the number of memory accesses required. [...] Because it doesn't need a switch to kernel context, rescheduling a rescheduling a goroutine is much cheaper than rescheduling a thread goroutine is much cheaper than rescheduling a thread." Donovan & Kernighan, The Go Programming Language The claim: 65
. . $ ./pingpong Elapsed: 4.184381 0.209219 µs per switch . . . it does look a good bit faster than thread switching. So what's the Go scheduler doing? 67
basically described by three data structures: An M represents an OS thread A G represents a goroutine A P represents general context for executing Go code. 68
An M represents an OS thread A G represents a goroutine A P represents general context for executing Go code. Each P contains a queue of runnable goroutines. 69
preempted or blocked in syscalls¹ go onto a special global runqueue. A P which becomes idle can steal work from another P. ¹This is only true in some cases, but it's not important. 73
preempted or blocked in syscalls¹ go onto a special global runqueue. A P which becomes idle can steal work from another P. ¹This is only true in some cases, but it's not important. 74
Processes communicate via asynchronous message passing (no shared state). Erlang code is compiled to bytecode and executed by a virtual machine. This architecture enables a simple preemption mechanism (not timer- or watcher-based). It uses the notion of a reduction budget. 83
Every operation costs reductions: - calling a function - sending a message to another process - I/O - garbage collection - etc. After you use up your reduction budget, you get preempted. 84
3700-line switch statemen // ... OpCase(i_call_f): { SET_CP(c_p, I+2); I = (BeamInstr *) Arg(0)); Dispatch(); } // ... } #define Dispatch() do { dis_next = (BeamInstr *) *I; if (REDUCTIONS > 0 || REDUCTIONS > -reduction_budget) { REDUCTIONS--; Go = dis_next; goto emulator_loop; } else { goto context_switch; } } while (0) The core of the VM is a bytecode dispatch loop. For example, to call a function 85
3700-line switch statemen // ... OpCase(i_call_f): { SET_CP(c_p, I+2); I = (BeamInstr *) Arg(0)); Dispatch(); } // ... } #define Dispatch() do { dis_next = (BeamInstr *) *I; if (REDUCTIONS > 0 || REDUCTIONS > -reduction_budget) { REDUCTIONS--; Go = dis_next; goto emulator_loop; } else { goto context_switch; } } while (0) The core of the VM is a bytecode dispatch loop. For example, to call a function, (1) set the continuation pointer, (2) advance the instruction pointer (3) call Dispatch() 86
3700-line switch statemen // ... OpCase(i_call_f): { SET_CP(c_p, I+2); I = (BeamInstr *) Arg(0)); Dispatch(); } // ... } #define Dispatch() do { dis_next = (BeamInstr *) *I; if (REDUCTIONS > 0 || REDUCTIONS > -reduction_budget) { REDUCTIONS--; Go = dis_next; goto emulator_loop; } else { goto context_switch; } } while (0) The core of the VM is a bytecode dispatch loop. For example, to call a function, (1) set the continuation pointer, (2) advance the instruction pointer (3) call Dispatch() If we still have reductions, decrement the reduction counter and go through the emulator loop for the next instruction. 87
3700-line switch statemen // ... OpCase(i_call_f): { SET_CP(c_p, I+2); I = (BeamInstr *) Arg(0)); Dispatch(); } // ... } #define Dispatch() do { dis_next = (BeamInstr *) *I; if (REDUCTIONS > 0 || REDUCTIONS > -reduction_budget) { REDUCTIONS--; Go = dis_next; goto emulator_loop; } else { goto context_switch; } } while (0) The core of the VM is a bytecode dispatch loop. For example, to call a function, (1) set the continuation pointer, (2) advance the instruction pointer (3) call Dispatch() If we still have reductions, decrement the reduction counter and go through the emulator loop for the next instruction. Otherwise, context-switch. 88
at safepoints Decisions - Granular priorities vs implementation simplicity - Latency predictability vs baseline overhead Scalable scheduling: not that mysterious! 98
$ curl -o trace.out http://localhost/debug/pprof/trace?seconds/5 # Trace a running program $ go trace trace.out # Run trace viewer go tool trace: Multipurpose program execution tracer Scheduler observability 103