LCOV - code coverage report
Current view: top level - kernel/sched - cputime.c (source / functions)
Test:         coverage.info
Date:         2022-12-09 01:23:36

                 Hit    Total    Coverage
Lines:            17      107      15.9 %
Functions:         1       12       8.3 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * Simple CPU accounting cgroup controller
       4             :  */
       5             : 
       6             : #ifdef CONFIG_IRQ_TIME_ACCOUNTING
       7             : 
       8             : /*
       9             :  * There are no locks covering percpu hardirq/softirq time.
      10             :  * They are only modified in vtime_account, on corresponding CPU
      11             :  * with interrupts disabled. So, writes are safe.
      12             :  * They are read and saved off onto struct rq in update_rq_clock().
       13             :  * This may result in another CPU reading this CPU's irq time and racing
       14             :  * with irq/vtime_account on this CPU. We would get either the old or the
       15             :  * new value, with the side effect of accounting a slice of irq time to the
       16             :  * wrong task when an irq is in progress while we read rq->clock. That is a
       17             :  * worthy compromise in place of having locks on each irq in account_system_time.
      18             :  */
      19             : DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
      20             : 
      21             : static int sched_clock_irqtime;
      22             : 
      23             : void enable_sched_clock_irqtime(void)
      24             : {
      25             :         sched_clock_irqtime = 1;
      26             : }
      27             : 
      28             : void disable_sched_clock_irqtime(void)
      29             : {
      30             :         sched_clock_irqtime = 0;
      31             : }
      32             : 
      33             : static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
      34             :                                   enum cpu_usage_stat idx)
      35             : {
      36             :         u64 *cpustat = kcpustat_this_cpu->cpustat;
      37             : 
      38             :         u64_stats_update_begin(&irqtime->sync);
      39             :         cpustat[idx] += delta;
      40             :         irqtime->total += delta;
      41             :         irqtime->tick_delta += delta;
      42             :         u64_stats_update_end(&irqtime->sync);
      43             : }
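
The u64_stats_update_begin()/end() pair above is the writer side of a
seqcount: on 32-bit kernels it lets readers detect torn 64-bit updates (on
64-bit it compiles away). A minimal sketch of the matching reader, along
the lines of irq_time_read() in kernel/sched/sched.h:

        static inline u64 irq_time_read(int cpu)
        {
                struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
                unsigned int seq;
                u64 total;

                do {
                        /* retry if a writer ran update_begin/end meanwhile */
                        seq = u64_stats_fetch_begin(&irqtime->sync);
                        total = irqtime->total;
                } while (u64_stats_fetch_retry(&irqtime->sync, seq));

                return total;
        }
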
      44             : 
      45             : /*
      46             :  * Called after incrementing preempt_count on {soft,}irq_enter
      47             :  * and before decrementing preempt_count on {soft,}irq_exit.
      48             :  */
      49             : void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
      50             : {
      51             :         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
      52             :         unsigned int pc;
      53             :         s64 delta;
      54             :         int cpu;
      55             : 
      56             :         if (!sched_clock_irqtime)
      57             :                 return;
      58             : 
      59             :         cpu = smp_processor_id();
      60             :         delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
      61             :         irqtime->irq_start_time += delta;
      62             :         pc = irq_count() - offset;
      63             : 
      64             :         /*
      65             :          * We do not account for softirq time from ksoftirqd here.
      66             :          * We want to continue accounting softirq time to ksoftirqd thread
       67             :  * in that case, so as not to confuse the scheduler with a special task
       68             :  * that does not consume any time but still wants to run.
      69             :          */
      70             :         if (pc & HARDIRQ_MASK)
      71             :                 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
      72             :         else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
      73             :                 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
      74             : }
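
A userspace sketch of the preempt_count demultiplexing above. The bit
layout constants are copied by hand from include/linux/preempt.h and are
an assumption of this sketch, not part of this file:

        #include <stdio.h>

        #define SOFTIRQ_OFFSET  0x00000100U     /* softirq count starts at bit 8  */
        #define HARDIRQ_OFFSET  0x00010000U     /* hardirq count starts at bit 16 */
        #define HARDIRQ_MASK    0x000f0000U

        static const char *context(unsigned int pc)
        {
                if (pc & HARDIRQ_MASK)
                        return "hardirq";       /* accounted as CPUTIME_IRQ */
                if (pc & SOFTIRQ_OFFSET)
                        return "softirq";       /* CPUTIME_SOFTIRQ, unless ksoftirqd */
                return "task";
        }

        int main(void)
        {
                /* On hardirq entry, irq_count() already includes the new level;
                 * subtracting the offset leaves the context that was interrupted,
                 * which is the context the elapsed slice belongs to. */
                unsigned int irq_count = HARDIRQ_OFFSET + SOFTIRQ_OFFSET;

                printf("%s\n", context(irq_count - HARDIRQ_OFFSET)); /* "softirq" */
                return 0;
        }
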
      75             : 
      76             : static u64 irqtime_tick_accounted(u64 maxtime)
      77             : {
      78             :         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
      79             :         u64 delta;
      80             : 
      81             :         delta = min(irqtime->tick_delta, maxtime);
      82             :         irqtime->tick_delta -= delta;
      83             : 
      84             :         return delta;
      85             : }
      86             : 
      87             : #else /* CONFIG_IRQ_TIME_ACCOUNTING */
      88             : 
      89             : #define sched_clock_irqtime     (0)
      90             : 
      91             : static u64 irqtime_tick_accounted(u64 dummy)
      92             : {
      93             :         return 0;
      94             : }
      95             : 
      96             : #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
      97             : 
      98             : static inline void task_group_account_field(struct task_struct *p, int index,
      99             :                                             u64 tmp)
     100             : {
     101             :         /*
     102             :          * Since all updates are sure to touch the root cgroup, we
      103             :          * go ahead and touch it first. If the root cgroup
     104             :          * is the only cgroup, then nothing else should be necessary.
     105             :          *
     106             :          */
     107          12 :         __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
     108             : 
     109          12 :         cgroup_account_cputime_field(p, index, tmp);
     110             : }
     111             : 
     112             : /*
     113             :  * Account user CPU time to a process.
     114             :  * @p: the process that the CPU time gets accounted to
     115             :  * @cputime: the CPU time spent in user space since the last update
     116             :  */
     117           0 : void account_user_time(struct task_struct *p, u64 cputime)
     118             : {
     119             :         int index;
     120             : 
     121             :         /* Add user time to process. */
     122          12 :         p->utime += cputime;
     123          12 :         account_group_user_time(p, cputime);
     124             : 
     125          24 :         index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
     126             : 
     127             :         /* Add user time to cpustat. */
     128          24 :         task_group_account_field(p, index, cputime);
     129             : 
     130             :         /* Account for user time used */
     131          12 :         acct_account_cputime(p);
     132           0 : }
     133             : 
     134             : /*
     135             :  * Account guest CPU time to a process.
     136             :  * @p: the process that the CPU time gets accounted to
     137             :  * @cputime: the CPU time spent in virtual machine since the last update
     138             :  */
     139           0 : void account_guest_time(struct task_struct *p, u64 cputime)
     140             : {
     141           0 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     142             : 
     143             :         /* Add guest time to process. */
     144           0 :         p->utime += cputime;
     145           0 :         account_group_user_time(p, cputime);
     146           0 :         p->gtime += cputime;
     147             : 
     148             :         /* Add guest time to cpustat. */
     149           0 :         if (task_nice(p) > 0) {
     150           0 :                 task_group_account_field(p, CPUTIME_NICE, cputime);
     151           0 :                 cpustat[CPUTIME_GUEST_NICE] += cputime;
     152             :         } else {
     153           0 :                 task_group_account_field(p, CPUTIME_USER, cputime);
     154           0 :                 cpustat[CPUTIME_GUEST] += cputime;
     155             :         }
     156           0 : }
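
Note the deliberate double accounting above: guest time goes both into
p->utime / CPUTIME_USER (or the nice variants) and into p->gtime /
CPUTIME_GUEST. Worked example: if a vCPU task at nice 0 runs for 10 ticks
in guest mode, the user and guest columns of /proc/stat both grow by 10
ticks, so a consumer must not add guest on top of user; user already
contains it.
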
     157             : 
     158             : /*
     159             :  * Account system CPU time to a process and desired cpustat field
     160             :  * @p: the process that the CPU time gets accounted to
     161             :  * @cputime: the CPU time spent in kernel space since the last update
      162             :  * @index: the cpustat field that has to be updated
     163             :  */
     164           0 : void account_system_index_time(struct task_struct *p,
     165             :                                u64 cputime, enum cpu_usage_stat index)
     166             : {
     167             :         /* Add system time to process. */
     168           0 :         p->stime += cputime;
     169           0 :         account_group_system_time(p, cputime);
     170             : 
     171             :         /* Add system time to cpustat. */
     172           0 :         task_group_account_field(p, index, cputime);
     173             : 
     174             :         /* Account for system time used */
     175           0 :         acct_account_cputime(p);
     176           0 : }
     177             : 
     178             : /*
     179             :  * Account system CPU time to a process.
     180             :  * @p: the process that the CPU time gets accounted to
     181             :  * @hardirq_offset: the offset to subtract from hardirq_count()
     182             :  * @cputime: the CPU time spent in kernel space since the last update
     183             :  */
     184           0 : void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
     185             : {
     186             :         int index;
     187             : 
     188           0 :         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
     189           0 :                 account_guest_time(p, cputime);
     190           0 :                 return;
     191             :         }
     192             : 
     193           0 :         if (hardirq_count() - hardirq_offset)
     194             :                 index = CPUTIME_IRQ;
     195           0 :         else if (in_serving_softirq())
     196             :                 index = CPUTIME_SOFTIRQ;
     197             :         else
     198           0 :                 index = CPUTIME_SYSTEM;
     199             : 
     200           0 :         account_system_index_time(p, cputime, index);
     201             : }
     202             : 
     203             : /*
     204             :  * Account for involuntary wait time.
     205             :  * @cputime: the CPU time spent in involuntary wait
     206             :  */
     207           0 : void account_steal_time(u64 cputime)
     208             : {
     209           0 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     210             : 
     211           0 :         cpustat[CPUTIME_STEAL] += cputime;
     212           0 : }
     213             : 
     214             : /*
     215             :  * Account for idle time.
     216             :  * @cputime: the CPU time spent in idle wait
     217             :  */
     218           0 : void account_idle_time(u64 cputime)
     219             : {
     220           1 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     221           1 :         struct rq *rq = this_rq();
     222             : 
     223           2 :         if (atomic_read(&rq->nr_iowait) > 0)
     224           0 :                 cpustat[CPUTIME_IOWAIT] += cputime;
     225             :         else
     226           1 :                 cpustat[CPUTIME_IDLE] += cputime;
     227           0 : }
     228             : 
     229             : /*
     230             :  * When a guest is interrupted for a longer amount of time, missed clock
     231             :  * ticks are not redelivered later. Due to that, this function may on
      232             :  * occasion account more time than the calling functions think has elapsed.
     233             :  */
     234             : static __always_inline u64 steal_account_process_time(u64 maxtime)
     235             : {
     236             : #ifdef CONFIG_PARAVIRT
     237             :         if (static_key_false(&paravirt_steal_enabled)) {
     238             :                 u64 steal;
     239             : 
     240             :                 steal = paravirt_steal_clock(smp_processor_id());
     241             :                 steal -= this_rq()->prev_steal_time;
     242             :                 steal = min(steal, maxtime);
     243             :                 account_steal_time(steal);
     244             :                 this_rq()->prev_steal_time += steal;
     245             : 
     246             :                 return steal;
     247             :         }
     248             : #endif
     249             :         return 0;
     250             : }
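
The pattern here is: take the delta of a monotonically increasing counter,
clamp it to maxtime, and advance prev_steal_time only by what was actually
accounted, so any clamped remainder is picked up by the next call. A
minimal userspace sketch of the same pattern (names hypothetical):

        /* steal_clock: a monotonically increasing ns counter, standing in
         * for paravirt_steal_clock(smp_processor_id()). */
        static unsigned long long prev_steal;

        static unsigned long long steal_delta(unsigned long long steal_clock,
                                              unsigned long long maxtime)
        {
                unsigned long long steal = steal_clock - prev_steal;

                if (steal > maxtime)
                        steal = maxtime;        /* never exceed the caller's budget */
                prev_steal += steal;            /* only consume what was accounted */

                return steal;
        }
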
     251             : 
     252             : /*
     253             :  * Account how much elapsed time was spent in steal, irq, or softirq time.
     254             :  */
     255             : static inline u64 account_other_time(u64 max)
     256             : {
     257             :         u64 accounted;
     258             : 
     259             :         lockdep_assert_irqs_disabled();
     260             : 
     261             :         accounted = steal_account_process_time(max);
     262             : 
     263             :         if (accounted < max)
     264             :                 accounted += irqtime_tick_accounted(max - accounted);
     265             : 
     266             :         return accounted;
     267             : }
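
Worked example (hypothetical numbers): with max = 4,000,000 ns, if
steal_account_process_time() reports 3,000,000 ns of steal time, then
irqtime_tick_accounted() is offered at most max - accounted = 1,000,000 ns,
so the combined "other" time can never exceed the caller's budget.
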
     268             : 
     269             : #ifdef CONFIG_64BIT
     270             : static inline u64 read_sum_exec_runtime(struct task_struct *t)
     271             : {
     272             :         return t->se.sum_exec_runtime;
     273             : }
     274             : #else
     275             : static u64 read_sum_exec_runtime(struct task_struct *t)
     276             : {
     277             :         u64 ns;
     278             :         struct rq_flags rf;
     279             :         struct rq *rq;
     280             : 
     281             :         rq = task_rq_lock(t, &rf);
     282             :         ns = t->se.sum_exec_runtime;
     283             :         task_rq_unlock(rq, t, &rf);
     284             : 
     285             :         return ns;
     286             : }
     287             : #endif
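
The 32-bit variant takes task_rq_lock() because a 64-bit load is not
single-copy atomic there. An illustrative userspace sketch of the hazard
being avoided (not kernel code):

        #include <stdint.h>

        /* On a 32-bit CPU this compiles to two separate loads; if a writer
         * updates the counter between them, the reassembled value is one
         * that was never stored (old low half, new high half). */
        uint64_t torn_read(const volatile uint32_t half[2])
        {
                uint32_t lo = half[0];
                uint32_t hi = half[1];
                return ((uint64_t)hi << 32) | lo;
        }
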
     288             : 
     289             : /*
     290             :  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
     291             :  * tasks (sum on group iteration) belonging to @tsk's group.
     292             :  */
     293           0 : void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
     294             : {
     295           0 :         struct signal_struct *sig = tsk->signal;
     296             :         u64 utime, stime;
     297             :         struct task_struct *t;
     298             :         unsigned int seq, nextseq;
     299             :         unsigned long flags;
     300             : 
     301             :         /*
     302             :          * Update current task runtime to account pending time since last
     303             :          * scheduler action or thread_group_cputime() call. This thread group
     304             :          * might have other running tasks on different CPUs, but updating
      305             :  * their runtime can affect syscall performance, so we skip accounting
      306             :  * those pending times and rely only on values updated on tick or
     307             :          * other scheduler action.
     308             :          */
     309           0 :         if (same_thread_group(current, tsk))
     310           0 :                 (void) task_sched_runtime(current);
     311             : 
     312             :         rcu_read_lock();
     313             :         /* Attempt a lockless read on the first round. */
     314           0 :         nextseq = 0;
     315             :         do {
     316           0 :                 seq = nextseq;
     317           0 :                 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
     318           0 :                 times->utime = sig->utime;
     319           0 :                 times->stime = sig->stime;
     320           0 :                 times->sum_exec_runtime = sig->sum_sched_runtime;
     321             : 
     322           0 :                 for_each_thread(tsk, t) {
     323           0 :                         task_cputime(t, &utime, &stime);
     324           0 :                         times->utime += utime;
     325           0 :                         times->stime += stime;
     326           0 :                         times->sum_exec_runtime += read_sum_exec_runtime(t);
     327             :                 }
     328             :                 /* If lockless access failed, take the lock. */
     329           0 :                 nextseq = 1;
     330           0 :         } while (need_seqretry(&sig->stats_lock, seq));
     331           0 :         done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
     332             :         rcu_read_unlock();
     333           0 : }
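
The loop above is the "lockless first, locked on retry" seqlock pattern:
the first pass runs with an even sequence and takes no lock; if
need_seqretry() sees that a writer raced, nextseq = 1 makes the second
pass enter read_seqbegin_or_lock_irqsave() with an odd value, which takes
sig->stats_lock outright and therefore cannot fail. The same skeleton with
the bookkeeping annotated:

        nextseq = 0;                    /* even: optimistic, lockless pass */
        do {
                seq = nextseq;
                flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
                /* ... snapshot the protected fields ... */
                nextseq = 1;            /* odd: any retry takes the lock */
        } while (need_seqretry(&sig->stats_lock, seq));
        done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
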
     334             : 
     335             : #ifdef CONFIG_IRQ_TIME_ACCOUNTING
     336             : /*
     337             :  * Account a tick to a process and cpustat
     338             :  * @p: the process that the CPU time gets accounted to
     339             :  * @user_tick: is the tick from userspace
     340             :  * @rq: the pointer to rq
     341             :  *
     342             :  * Tick demultiplexing follows the order
     343             :  * - pending hardirq update
     344             :  * - pending softirq update
     345             :  * - user_time
     346             :  * - idle_time
     347             :  * - system time
     348             :  *   - check for guest_time
     349             :  *   - else account as system_time
     350             :  *
      351             :  * The check for hardirq is done for both system and user time, as there is
      352             :  * no timer going off while we are in hardirq and hence we may never get an
      353             :  * opportunity to update it solely in system time.
      354             :  * p->stime and friends are only updated on system time and not on irq or
      355             :  * softirq time, as those do not count in task exec_runtime any more.
     356             :  */
     357             : static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
     358             :                                          int ticks)
     359             : {
     360             :         u64 other, cputime = TICK_NSEC * ticks;
     361             : 
     362             :         /*
     363             :          * When returning from idle, many ticks can get accounted at
     364             :          * once, including some ticks of steal, irq, and softirq time.
     365             :          * Subtract those ticks from the amount of time accounted to
     366             :          * idle, or potentially user or system time. Due to rounding,
     367             :          * other time can exceed ticks occasionally.
     368             :          */
     369             :         other = account_other_time(ULONG_MAX);
     370             :         if (other >= cputime)
     371             :                 return;
     372             : 
     373             :         cputime -= other;
     374             : 
     375             :         if (this_cpu_ksoftirqd() == p) {
     376             :                 /*
      377             :  * ksoftirqd time does not get accounted in cpu_softirq_time.
     378             :                  * So, we have to handle it separately here.
     379             :                  * Also, p->stime needs to be updated for ksoftirqd.
     380             :                  */
     381             :                 account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
     382             :         } else if (user_tick) {
     383             :                 account_user_time(p, cputime);
     384             :         } else if (p == this_rq()->idle) {
     385             :                 account_idle_time(cputime);
     386             :         } else if (p->flags & PF_VCPU) { /* System time or guest time */
     387             :                 account_guest_time(p, cputime);
     388             :         } else {
     389             :                 account_system_index_time(p, cputime, CPUTIME_SYSTEM);
     390             :         }
     391             : }
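
Worked example (assuming HZ = 1000, so TICK_NSEC = 1,000,000 ns): waking
from a 5-tick idle stretch gives cputime = 5,000,000 ns; if 2,300,000 ns
of steal/irq/softirq time accumulated meanwhile, account_other_time()
returns that amount and only the remaining 2,700,000 ns are demultiplexed
to the softirq, user, idle, guest, or system bucket by the ladder above.
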
     392             : 
     393             : static void irqtime_account_idle_ticks(int ticks)
     394             : {
     395             :         irqtime_account_process_tick(current, 0, ticks);
     396             : }
     397             : #else /* CONFIG_IRQ_TIME_ACCOUNTING */
     398             : static inline void irqtime_account_idle_ticks(int ticks) { }
     399             : static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
     400             :                                                 int nr_ticks) { }
     401             : #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
     402             : 
     403             : /*
     404             :  * Use precise platform statistics if available:
     405             :  */
     406             : #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
     407             : 
     408             : # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
     409             : void vtime_task_switch(struct task_struct *prev)
     410             : {
     411             :         if (is_idle_task(prev))
     412             :                 vtime_account_idle(prev);
     413             :         else
     414             :                 vtime_account_kernel(prev);
     415             : 
     416             :         vtime_flush(prev);
     417             :         arch_vtime_task_switch(prev);
     418             : }
     419             : # endif
     420             : 
     421             : void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
     422             : {
     423             :         unsigned int pc = irq_count() - offset;
     424             : 
     425             :         if (pc & HARDIRQ_OFFSET) {
     426             :                 vtime_account_hardirq(tsk);
     427             :         } else if (pc & SOFTIRQ_OFFSET) {
     428             :                 vtime_account_softirq(tsk);
     429             :         } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
     430             :                    is_idle_task(tsk)) {
     431             :                 vtime_account_idle(tsk);
     432             :         } else {
     433             :                 vtime_account_kernel(tsk);
     434             :         }
     435             : }
     436             : 
     437             : void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
     438             :                     u64 *ut, u64 *st)
     439             : {
     440             :         *ut = curr->utime;
     441             :         *st = curr->stime;
     442             : }
     443             : 
     444             : void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     445             : {
     446             :         *ut = p->utime;
     447             :         *st = p->stime;
     448             : }
     449             : EXPORT_SYMBOL_GPL(task_cputime_adjusted);
     450             : 
     451             : void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     452             : {
     453             :         struct task_cputime cputime;
     454             : 
     455             :         thread_group_cputime(p, &cputime);
     456             : 
     457             :         *ut = cputime.utime;
     458             :         *st = cputime.stime;
     459             : }
     460             : 
     461             : #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
     462             : 
     463             : /*
     464             :  * Account a single tick of CPU time.
     465             :  * @p: the process that the CPU time gets accounted to
     466             :  * @user_tick: indicates if the tick is a user or a system tick
     467             :  */
     468          13 : void account_process_tick(struct task_struct *p, int user_tick)
     469             : {
     470             :         u64 cputime, steal;
     471             : 
     472             :         if (vtime_accounting_enabled_this_cpu())
     473             :                 return;
     474             : 
     475             :         if (sched_clock_irqtime) {
     476             :                 irqtime_account_process_tick(p, user_tick, 1);
     477             :                 return;
     478             :         }
     479             : 
     480          13 :         cputime = TICK_NSEC;
     481          13 :         steal = steal_account_process_time(ULONG_MAX);
     482             : 
     483             :         if (steal >= cputime)
     484             :                 return;
     485             : 
     486          13 :         cputime -= steal;
     487             : 
     488          13 :         if (user_tick)
     489             :                 account_user_time(p, cputime);
     490           2 :         else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
     491           0 :                 account_system_time(p, HARDIRQ_OFFSET, cputime);
     492             :         else
     493             :                 account_idle_time(cputime);
     494             : }
     495             : 
     496             : /*
     497             :  * Account multiple ticks of idle time.
      498             :  * @ticks: number of ticks spent idle
     499             :  */
     500           0 : void account_idle_ticks(unsigned long ticks)
     501             : {
     502             :         u64 cputime, steal;
     503             : 
     504             :         if (sched_clock_irqtime) {
     505             :                 irqtime_account_idle_ticks(ticks);
     506             :                 return;
     507             :         }
     508             : 
     509           0 :         cputime = ticks * TICK_NSEC;
     510           0 :         steal = steal_account_process_time(ULONG_MAX);
     511             : 
     512           0 :         if (steal >= cputime)
     513             :                 return;
     514             : 
     515           0 :         cputime -= steal;
     516             :         account_idle_time(cputime);
     517             : }
     518             : 
     519             : /*
      520             :  * Adjust the imprecise tick based cputime against the scheduler's own
      521             :  * runtime accounting.
      522             :  *
      523             :  * Tick based cputime accounting depends on whether a task's scheduling
      524             :  * timeslices happen to be interrupted by the timer.  Depending on these
      525             :  * circumstances, the number of observed interrupts may over- or
      526             :  * under-estimate the real user and system cputime, matching it only with
      527             :  * variable precision.
     528             :  *
     529             :  * Fix this by scaling these tick based values against the total runtime
     530             :  * accounted by the CFS scheduler.
     531             :  *
     532             :  * This code provides the following guarantees:
     533             :  *
     534             :  *   stime + utime == rtime
     535             :  *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
     536             :  *
     537             :  * Assuming that rtime_i+1 >= rtime_i.
     538             :  */
     539           0 : void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
     540             :                     u64 *ut, u64 *st)
     541             : {
     542             :         u64 rtime, stime, utime;
     543             :         unsigned long flags;
     544             : 
     545             :         /* Serialize concurrent callers such that we can honour our guarantees */
     546           0 :         raw_spin_lock_irqsave(&prev->lock, flags);
     547           0 :         rtime = curr->sum_exec_runtime;
     548             : 
     549             :         /*
     550             :          * This is possible under two circumstances:
     551             :          *  - rtime isn't monotonic after all (a bug);
     552             :          *  - we got reordered by the lock.
     553             :          *
     554             :          * In both cases this acts as a filter such that the rest of the code
     555             :          * can assume it is monotonic regardless of anything else.
     556             :          */
     557           0 :         if (prev->stime + prev->utime >= rtime)
     558             :                 goto out;
     559             : 
     560           0 :         stime = curr->stime;
     561           0 :         utime = curr->utime;
     562             : 
     563             :         /*
     564             :          * If either stime or utime are 0, assume all runtime is userspace.
     565             :          * Once a task gets some ticks, the monotonicity code at 'update:'
     566             :          * will ensure things converge to the observed ratio.
     567             :          */
     568           0 :         if (stime == 0) {
     569             :                 utime = rtime;
     570             :                 goto update;
     571             :         }
     572             : 
     573           0 :         if (utime == 0) {
     574             :                 stime = rtime;
     575             :                 goto update;
     576             :         }
     577             : 
     578           0 :         stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
     579             : 
     580             : update:
     581             :         /*
     582             :          * Make sure stime doesn't go backwards; this preserves monotonicity
     583             :          * for utime because rtime is monotonic.
     584             :          *
     585             :          *  utime_i+1 = rtime_i+1 - stime_i
     586             :          *            = rtime_i+1 - (rtime_i - utime_i)
     587             :          *            = (rtime_i+1 - rtime_i) + utime_i
     588             :          *            >= utime_i
     589             :          */
     590           0 :         if (stime < prev->stime)
     591           0 :                 stime = prev->stime;
     592           0 :         utime = rtime - stime;
     593             : 
     594             :         /*
     595             :          * Make sure utime doesn't go backwards; this still preserves
     596             :          * monotonicity for stime, analogous argument to above.
     597             :          */
     598           0 :         if (utime < prev->utime) {
     599           0 :                 utime = prev->utime;
     600           0 :                 stime = rtime - utime;
     601             :         }
     602             : 
     603           0 :         prev->stime = stime;
     604           0 :         prev->utime = utime;
     605             : out:
     606           0 :         *ut = prev->utime;
     607           0 :         *st = prev->stime;
     608           0 :         raw_spin_unlock_irqrestore(&prev->lock, flags);
     609           0 : }
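
A worked pass through the scaling and clamping above (hypothetical numbers,
in ns): suppose the tick sampling observed stime = 30 and utime = 10 while
the scheduler measured rtime = 100. Then:

        stime = mul_u64_u64_div_u64(30, 100, 30 + 10);  /* 30 * 100 / 40 = 75 */
        utime = 100 - stime;                            /* = 25, same 3:1 ratio */

If a previous call had already reported prev->stime = 80, the monotonicity
clamp raises stime back to 80 and utime becomes 20. mul_u64_u64_div_u64()
is used instead of a plain multiply so the 64x64-bit intermediate product
cannot overflow.
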
     610             : 
     611           0 : void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     612             : {
     613           0 :         struct task_cputime cputime = {
     614           0 :                 .sum_exec_runtime = p->se.sum_exec_runtime,
     615             :         };
     616             : 
     617           0 :         if (task_cputime(p, &cputime.utime, &cputime.stime))
     618             :                 cputime.sum_exec_runtime = task_sched_runtime(p);
     619           0 :         cputime_adjust(&cputime, &p->prev_cputime, ut, st);
     620           0 : }
     621             : EXPORT_SYMBOL_GPL(task_cputime_adjusted);
     622             : 
     623           0 : void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     624             : {
     625             :         struct task_cputime cputime;
     626             : 
     627           0 :         thread_group_cputime(p, &cputime);
     628           0 :         cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
     629           0 : }
     630             : #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
     631             : 
     632             : #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
     633             : static u64 vtime_delta(struct vtime *vtime)
     634             : {
     635             :         unsigned long long clock;
     636             : 
     637             :         clock = sched_clock();
     638             :         if (clock < vtime->starttime)
     639             :                 return 0;
     640             : 
     641             :         return clock - vtime->starttime;
     642             : }
     643             : 
     644             : static u64 get_vtime_delta(struct vtime *vtime)
     645             : {
     646             :         u64 delta = vtime_delta(vtime);
     647             :         u64 other;
     648             : 
     649             :         /*
     650             :          * Unlike tick based timing, vtime based timing never has lost
      651             :          * ticks, so there is no need for steal time accounting to make up
      652             :          * for lost ticks. Vtime accounts a rounded version of actual
     653             :          * elapsed time. Limit account_other_time to prevent rounding
     654             :          * errors from causing elapsed vtime to go negative.
     655             :          */
     656             :         other = account_other_time(delta);
     657             :         WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
     658             :         vtime->starttime += delta;
     659             : 
     660             :         return delta - other;
     661             : }
     662             : 
     663             : static void vtime_account_system(struct task_struct *tsk,
     664             :                                  struct vtime *vtime)
     665             : {
     666             :         vtime->stime += get_vtime_delta(vtime);
     667             :         if (vtime->stime >= TICK_NSEC) {
     668             :                 account_system_time(tsk, irq_count(), vtime->stime);
     669             :                 vtime->stime = 0;
     670             :         }
     671             : }
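
Note the accumulate-then-flush pattern: deltas are gathered in nanoseconds
and only handed to the tick accounting once at least a tick's worth has
built up, at which point the whole amount is flushed. Worked example
(assuming HZ = 1000): three deltas of 400,000 ns leave vtime->stime at
1,200,000 ns; that crosses TICK_NSEC = 1,000,000, so all 1,200,000 ns go
to account_system_time() and the accumulator resets to zero.
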
     672             : 
     673             : static void vtime_account_guest(struct task_struct *tsk,
     674             :                                 struct vtime *vtime)
     675             : {
     676             :         vtime->gtime += get_vtime_delta(vtime);
     677             :         if (vtime->gtime >= TICK_NSEC) {
     678             :                 account_guest_time(tsk, vtime->gtime);
     679             :                 vtime->gtime = 0;
     680             :         }
     681             : }
     682             : 
     683             : static void __vtime_account_kernel(struct task_struct *tsk,
     684             :                                    struct vtime *vtime)
     685             : {
     686             :         /* We might have scheduled out from guest path */
     687             :         if (vtime->state == VTIME_GUEST)
     688             :                 vtime_account_guest(tsk, vtime);
     689             :         else
     690             :                 vtime_account_system(tsk, vtime);
     691             : }
     692             : 
     693             : void vtime_account_kernel(struct task_struct *tsk)
     694             : {
     695             :         struct vtime *vtime = &tsk->vtime;
     696             : 
     697             :         if (!vtime_delta(vtime))
     698             :                 return;
     699             : 
     700             :         write_seqcount_begin(&vtime->seqcount);
     701             :         __vtime_account_kernel(tsk, vtime);
     702             :         write_seqcount_end(&vtime->seqcount);
     703             : }
     704             : 
     705             : void vtime_user_enter(struct task_struct *tsk)
     706             : {
     707             :         struct vtime *vtime = &tsk->vtime;
     708             : 
     709             :         write_seqcount_begin(&vtime->seqcount);
     710             :         vtime_account_system(tsk, vtime);
     711             :         vtime->state = VTIME_USER;
     712             :         write_seqcount_end(&vtime->seqcount);
     713             : }
     714             : 
     715             : void vtime_user_exit(struct task_struct *tsk)
     716             : {
     717             :         struct vtime *vtime = &tsk->vtime;
     718             : 
     719             :         write_seqcount_begin(&vtime->seqcount);
     720             :         vtime->utime += get_vtime_delta(vtime);
     721             :         if (vtime->utime >= TICK_NSEC) {
     722             :                 account_user_time(tsk, vtime->utime);
     723             :                 vtime->utime = 0;
     724             :         }
     725             :         vtime->state = VTIME_SYS;
     726             :         write_seqcount_end(&vtime->seqcount);
     727             : }
     728             : 
     729             : void vtime_guest_enter(struct task_struct *tsk)
     730             : {
     731             :         struct vtime *vtime = &tsk->vtime;
     732             :         /*
     733             :          * The flags must be updated under the lock with
     734             :          * the vtime_starttime flush and update.
      735             :          * That enforces the right ordering and update sequence
     736             :          * synchronization against the reader (task_gtime())
     737             :          * that can thus safely catch up with a tickless delta.
     738             :          */
     739             :         write_seqcount_begin(&vtime->seqcount);
     740             :         vtime_account_system(tsk, vtime);
     741             :         tsk->flags |= PF_VCPU;
     742             :         vtime->state = VTIME_GUEST;
     743             :         write_seqcount_end(&vtime->seqcount);
     744             : }
     745             : EXPORT_SYMBOL_GPL(vtime_guest_enter);
     746             : 
     747             : void vtime_guest_exit(struct task_struct *tsk)
     748             : {
     749             :         struct vtime *vtime = &tsk->vtime;
     750             : 
     751             :         write_seqcount_begin(&vtime->seqcount);
     752             :         vtime_account_guest(tsk, vtime);
     753             :         tsk->flags &= ~PF_VCPU;
     754             :         vtime->state = VTIME_SYS;
     755             :         write_seqcount_end(&vtime->seqcount);
     756             : }
     757             : EXPORT_SYMBOL_GPL(vtime_guest_exit);
     758             : 
     759             : void vtime_account_idle(struct task_struct *tsk)
     760             : {
     761             :         account_idle_time(get_vtime_delta(&tsk->vtime));
     762             : }
     763             : 
     764             : void vtime_task_switch_generic(struct task_struct *prev)
     765             : {
     766             :         struct vtime *vtime = &prev->vtime;
     767             : 
     768             :         write_seqcount_begin(&vtime->seqcount);
     769             :         if (vtime->state == VTIME_IDLE)
     770             :                 vtime_account_idle(prev);
     771             :         else
     772             :                 __vtime_account_kernel(prev, vtime);
     773             :         vtime->state = VTIME_INACTIVE;
     774             :         vtime->cpu = -1;
     775             :         write_seqcount_end(&vtime->seqcount);
     776             : 
     777             :         vtime = &current->vtime;
     778             : 
     779             :         write_seqcount_begin(&vtime->seqcount);
     780             :         if (is_idle_task(current))
     781             :                 vtime->state = VTIME_IDLE;
     782             :         else if (current->flags & PF_VCPU)
     783             :                 vtime->state = VTIME_GUEST;
     784             :         else
     785             :                 vtime->state = VTIME_SYS;
     786             :         vtime->starttime = sched_clock();
     787             :         vtime->cpu = smp_processor_id();
     788             :         write_seqcount_end(&vtime->seqcount);
     789             : }
     790             : 
     791             : void vtime_init_idle(struct task_struct *t, int cpu)
     792             : {
     793             :         struct vtime *vtime = &t->vtime;
     794             :         unsigned long flags;
     795             : 
     796             :         local_irq_save(flags);
     797             :         write_seqcount_begin(&vtime->seqcount);
     798             :         vtime->state = VTIME_IDLE;
     799             :         vtime->starttime = sched_clock();
     800             :         vtime->cpu = cpu;
     801             :         write_seqcount_end(&vtime->seqcount);
     802             :         local_irq_restore(flags);
     803             : }
     804             : 
     805             : u64 task_gtime(struct task_struct *t)
     806             : {
     807             :         struct vtime *vtime = &t->vtime;
     808             :         unsigned int seq;
     809             :         u64 gtime;
     810             : 
     811             :         if (!vtime_accounting_enabled())
     812             :                 return t->gtime;
     813             : 
     814             :         do {
     815             :                 seq = read_seqcount_begin(&vtime->seqcount);
     816             : 
     817             :                 gtime = t->gtime;
     818             :                 if (vtime->state == VTIME_GUEST)
     819             :                         gtime += vtime->gtime + vtime_delta(vtime);
     820             : 
     821             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     822             : 
     823             :         return gtime;
     824             : }
     825             : 
     826             : /*
     827             :  * Fetch cputime raw values from fields of task_struct and
     828             :  * add up the pending nohz execution time since the last
     829             :  * cputime snapshot.
     830             :  */
     831             : bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
     832             : {
     833             :         struct vtime *vtime = &t->vtime;
     834             :         unsigned int seq;
     835             :         u64 delta;
     836             :         int ret;
     837             : 
     838             :         if (!vtime_accounting_enabled()) {
     839             :                 *utime = t->utime;
     840             :                 *stime = t->stime;
     841             :                 return false;
     842             :         }
     843             : 
     844             :         do {
     845             :                 ret = false;
     846             :                 seq = read_seqcount_begin(&vtime->seqcount);
     847             : 
     848             :                 *utime = t->utime;
     849             :                 *stime = t->stime;
     850             : 
     851             :                 /* Task is sleeping or idle, nothing to add */
     852             :                 if (vtime->state < VTIME_SYS)
     853             :                         continue;
     854             : 
     855             :                 ret = true;
     856             :                 delta = vtime_delta(vtime);
     857             : 
     858             :                 /*
     859             :                  * Task runs either in user (including guest) or kernel space,
     860             :                  * add pending nohz time to the right place.
     861             :                  */
     862             :                 if (vtime->state == VTIME_SYS)
     863             :                         *stime += vtime->stime + delta;
     864             :                 else
     865             :                         *utime += vtime->utime + delta;
     866             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     867             : 
     868             :         return ret;
     869             : }
     870             : 
     871             : static int vtime_state_fetch(struct vtime *vtime, int cpu)
     872             : {
     873             :         int state = READ_ONCE(vtime->state);
     874             : 
     875             :         /*
     876             :          * We raced against a context switch, fetch the
     877             :          * kcpustat task again.
     878             :          */
     879             :         if (vtime->cpu != cpu && vtime->cpu != -1)
     880             :                 return -EAGAIN;
     881             : 
     882             :         /*
     883             :          * Two possible things here:
     884             :          * 1) We are seeing the scheduling out task (prev) or any past one.
     885             :          * 2) We are seeing the scheduling in task (next) but it hasn't
      886             :          *    passed through vtime_task_switch() yet, so the pending
     887             :          *    cputime of the prev task may not be flushed yet.
     888             :          *
     889             :          * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
     890             :          */
     891             :         if (state == VTIME_INACTIVE)
     892             :                 return -EAGAIN;
     893             : 
     894             :         return state;
     895             : }
     896             : 
     897             : static u64 kcpustat_user_vtime(struct vtime *vtime)
     898             : {
     899             :         if (vtime->state == VTIME_USER)
     900             :                 return vtime->utime + vtime_delta(vtime);
     901             :         else if (vtime->state == VTIME_GUEST)
     902             :                 return vtime->gtime + vtime_delta(vtime);
     903             :         return 0;
     904             : }
     905             : 
     906             : static int kcpustat_field_vtime(u64 *cpustat,
     907             :                                 struct task_struct *tsk,
     908             :                                 enum cpu_usage_stat usage,
     909             :                                 int cpu, u64 *val)
     910             : {
     911             :         struct vtime *vtime = &tsk->vtime;
     912             :         unsigned int seq;
     913             : 
     914             :         do {
     915             :                 int state;
     916             : 
     917             :                 seq = read_seqcount_begin(&vtime->seqcount);
     918             : 
     919             :                 state = vtime_state_fetch(vtime, cpu);
     920             :                 if (state < 0)
     921             :                         return state;
     922             : 
     923             :                 *val = cpustat[usage];
     924             : 
     925             :                 /*
      926             :                  * Nice vs. unnice cputime accounting may be inaccurate if
      927             :                  * the nice value has changed since the last vtime update.
      928             :                  * But a proper fix would involve interrupting the target on
      929             :                  * nice updates, which is a no-go on nohz_full (although the
      930             :                  * scheduler may still interrupt the target if rescheduling is needed...)
     931             :                  */
     932             :                 switch (usage) {
     933             :                 case CPUTIME_SYSTEM:
     934             :                         if (state == VTIME_SYS)
     935             :                                 *val += vtime->stime + vtime_delta(vtime);
     936             :                         break;
     937             :                 case CPUTIME_USER:
     938             :                         if (task_nice(tsk) <= 0)
     939             :                                 *val += kcpustat_user_vtime(vtime);
     940             :                         break;
     941             :                 case CPUTIME_NICE:
     942             :                         if (task_nice(tsk) > 0)
     943             :                                 *val += kcpustat_user_vtime(vtime);
     944             :                         break;
     945             :                 case CPUTIME_GUEST:
     946             :                         if (state == VTIME_GUEST && task_nice(tsk) <= 0)
     947             :                                 *val += vtime->gtime + vtime_delta(vtime);
     948             :                         break;
     949             :                 case CPUTIME_GUEST_NICE:
     950             :                         if (state == VTIME_GUEST && task_nice(tsk) > 0)
     951             :                                 *val += vtime->gtime + vtime_delta(vtime);
     952             :                         break;
     953             :                 default:
     954             :                         break;
     955             :                 }
     956             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     957             : 
     958             :         return 0;
     959             : }
     960             : 
     961             : u64 kcpustat_field(struct kernel_cpustat *kcpustat,
     962             :                    enum cpu_usage_stat usage, int cpu)
     963             : {
     964             :         u64 *cpustat = kcpustat->cpustat;
     965             :         u64 val = cpustat[usage];
     966             :         struct rq *rq;
     967             :         int err;
     968             : 
     969             :         if (!vtime_accounting_enabled_cpu(cpu))
     970             :                 return val;
     971             : 
     972             :         rq = cpu_rq(cpu);
     973             : 
     974             :         for (;;) {
     975             :                 struct task_struct *curr;
     976             : 
     977             :                 rcu_read_lock();
     978             :                 curr = rcu_dereference(rq->curr);
     979             :                 if (WARN_ON_ONCE(!curr)) {
     980             :                         rcu_read_unlock();
     981             :                         return cpustat[usage];
     982             :                 }
     983             : 
     984             :                 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
     985             :                 rcu_read_unlock();
     986             : 
     987             :                 if (!err)
     988             :                         return val;
     989             : 
     990             :                 cpu_relax();
     991             :         }
     992             : }
     993             : EXPORT_SYMBOL_GPL(kcpustat_field);
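
A hypothetical caller, in the style of the /proc/stat readers this helper
exists for (the wrapper function is invented for illustration):

        /* vtime-aware: includes the not-yet-flushed nohz_full delta */
        static u64 cpu_idle_ns(int cpu)
        {
                return kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_IDLE, cpu);
        }
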
     994             : 
     995             : static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
     996             :                                     const struct kernel_cpustat *src,
     997             :                                     struct task_struct *tsk, int cpu)
     998             : {
     999             :         struct vtime *vtime = &tsk->vtime;
    1000             :         unsigned int seq;
    1001             : 
    1002             :         do {
    1003             :                 u64 *cpustat;
    1004             :                 u64 delta;
    1005             :                 int state;
    1006             : 
    1007             :                 seq = read_seqcount_begin(&vtime->seqcount);
    1008             : 
    1009             :                 state = vtime_state_fetch(vtime, cpu);
    1010             :                 if (state < 0)
    1011             :                         return state;
    1012             : 
    1013             :                 *dst = *src;
    1014             :                 cpustat = dst->cpustat;
    1015             : 
    1016             :                 /* Task is sleeping, dead or idle, nothing to add */
    1017             :                 if (state < VTIME_SYS)
    1018             :                         continue;
    1019             : 
    1020             :                 delta = vtime_delta(vtime);
    1021             : 
    1022             :                 /*
    1023             :                  * Task runs either in user (including guest) or kernel space,
    1024             :                  * add pending nohz time to the right place.
    1025             :                  */
    1026             :                 if (state == VTIME_SYS) {
    1027             :                         cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
    1028             :                 } else if (state == VTIME_USER) {
    1029             :                         if (task_nice(tsk) > 0)
    1030             :                                 cpustat[CPUTIME_NICE] += vtime->utime + delta;
    1031             :                         else
    1032             :                                 cpustat[CPUTIME_USER] += vtime->utime + delta;
    1033             :                 } else {
    1034             :                         WARN_ON_ONCE(state != VTIME_GUEST);
    1035             :                         if (task_nice(tsk) > 0) {
    1036             :                                 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
    1037             :                                 cpustat[CPUTIME_NICE] += vtime->gtime + delta;
    1038             :                         } else {
    1039             :                                 cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
    1040             :                                 cpustat[CPUTIME_USER] += vtime->gtime + delta;
    1041             :                         }
    1042             :                 }
    1043             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
    1044             : 
    1045             :         return 0;
    1046             : }
    1047             : 
    1048             : void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
    1049             : {
    1050             :         const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
    1051             :         struct rq *rq;
    1052             :         int err;
    1053             : 
    1054             :         if (!vtime_accounting_enabled_cpu(cpu)) {
    1055             :                 *dst = *src;
    1056             :                 return;
    1057             :         }
    1058             : 
    1059             :         rq = cpu_rq(cpu);
    1060             : 
    1061             :         for (;;) {
    1062             :                 struct task_struct *curr;
    1063             : 
    1064             :                 rcu_read_lock();
    1065             :                 curr = rcu_dereference(rq->curr);
    1066             :                 if (WARN_ON_ONCE(!curr)) {
    1067             :                         rcu_read_unlock();
    1068             :                         *dst = *src;
    1069             :                         return;
    1070             :                 }
    1071             : 
    1072             :                 err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
    1073             :                 rcu_read_unlock();
    1074             : 
    1075             :                 if (!err)
    1076             :                         return;
    1077             : 
    1078             :                 cpu_relax();
    1079             :         }
    1080             : }
    1081             : EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
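
Likewise, a hypothetical caller wanting a consistent snapshot of the whole
per-CPU array rather than a single field:

        struct kernel_cpustat snap;
        u64 user_ns;

        kcpustat_cpu_fetch(&snap, cpu);         /* loops internally until stable */
        user_ns = snap.cpustat[CPUTIME_USER];   /* consistent with the other fields */
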
    1082             : 
    1083             : #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

Generated by: LCOV version 1.14