LCOV - code coverage report
Current view: top level - mm - backing-dev.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 46 197 23.4 %
Date: 2022-12-09 01:23:36 Functions: 4 26 15.4 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : 
       3             : #include <linux/wait.h>
       4             : #include <linux/rbtree.h>
       5             : #include <linux/kthread.h>
       6             : #include <linux/backing-dev.h>
       7             : #include <linux/blk-cgroup.h>
       8             : #include <linux/freezer.h>
       9             : #include <linux/fs.h>
      10             : #include <linux/pagemap.h>
      11             : #include <linux/mm.h>
      12             : #include <linux/sched/mm.h>
      13             : #include <linux/sched.h>
      14             : #include <linux/module.h>
      15             : #include <linux/writeback.h>
      16             : #include <linux/device.h>
      17             : #include <trace/events/writeback.h>
      18             : 
      19             : struct backing_dev_info noop_backing_dev_info; /* initialised by default_bdi_init() */
      20             : EXPORT_SYMBOL_GPL(noop_backing_dev_info);
      21             : 
      22             : static struct class *bdi_class; /* created in bdi_class_init() */
      23             : static const char *bdi_unknown_name = "(unknown)";
      24             : 
      25             : /*
      26             :  * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
      27             :  * reader side locking.
      28             :  */
      29             : DEFINE_SPINLOCK(bdi_lock);
      30             : static u64 bdi_id_cursor;
      31             : static struct rb_root bdi_tree = RB_ROOT;
      32             : LIST_HEAD(bdi_list);
      33             : 
      34             : /* bdi_wq serves all asynchronous writeback tasks */
      35             : struct workqueue_struct *bdi_wq; /* allocated in default_bdi_init() */
      36             : 
      37             : #define K(x) ((x) << (PAGE_SHIFT - 10)) /* page count -> kB */
      38             : 
      39             : #ifdef CONFIG_DEBUG_FS
      40             : #include <linux/debugfs.h>
      41             : #include <linux/seq_file.h>
      42             : 
      43             : static struct dentry *bdi_debug_root;
      44             : 
      45             : static void bdi_debug_init(void)
      46             : { /* Create the "bdi" debugfs directory (no parent given) and remember its dentry. */
      47             :         bdi_debug_root = debugfs_create_dir("bdi", NULL);
      48             : }
      49             : 
      50             : static int bdi_debug_stats_show(struct seq_file *m, void *v)
      51             : { /* seq_file show callback: print writeback statistics for the bdi stored in m->private; always returns 0 */
      52             :         struct backing_dev_info *bdi = m->private;
      53             :         struct bdi_writeback *wb = &bdi->wb; /* only the bdi's embedded wb is reported */
      54             :         unsigned long background_thresh;
      55             :         unsigned long dirty_thresh;
      56             :         unsigned long wb_thresh;
      57             :         unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
      58             :         struct inode *inode;
      59             : 
      60             :         nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
      61             :         spin_lock(&wb->list_lock); /* held while the four inode lists are walked and counted */
      62             :         list_for_each_entry(inode, &wb->b_dirty, i_io_list)
      63             :                 nr_dirty++;
      64             :         list_for_each_entry(inode, &wb->b_io, i_io_list)
      65             :                 nr_io++;
      66             :         list_for_each_entry(inode, &wb->b_more_io, i_io_list)
      67             :                 nr_more_io++;
      68             :         list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
      69             :                 if (inode->i_state & I_DIRTY_TIME) /* count only inodes still flagged I_DIRTY_TIME */
      70             :                         nr_dirty_time++;
      71             :         spin_unlock(&wb->list_lock);
      72             : 
      73             :         global_dirty_limits(&background_thresh, &dirty_thresh);
      74             :         wb_thresh = wb_calc_thresh(wb, dirty_thresh);
      75             : 
      76             :         seq_printf(m, /* page-based values go through K() so they print in kB */
      77             :                    "BdiWriteback:       %10lu kB\n"
      78             :                    "BdiReclaimable:     %10lu kB\n"
      79             :                    "BdiDirtyThresh:     %10lu kB\n"
      80             :                    "DirtyThresh:        %10lu kB\n"
      81             :                    "BackgroundThresh:   %10lu kB\n"
      82             :                    "BdiDirtied:         %10lu kB\n"
      83             :                    "BdiWritten:         %10lu kB\n"
      84             :                    "BdiWriteBandwidth:  %10lu kBps\n"
      85             :                    "b_dirty:            %10lu\n"
      86             :                    "b_io:               %10lu\n"
      87             :                    "b_more_io:          %10lu\n"
      88             :                    "b_dirty_time:       %10lu\n"
      89             :                    "bdi_list:           %10u\n"
      90             :                    "state:              %10lx\n",
      91             :                    (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
      92             :                    (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
      93             :                    K(wb_thresh),
      94             :                    K(dirty_thresh),
      95             :                    K(background_thresh),
      96             :                    (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
      97             :                    (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
      98             :                    (unsigned long) K(wb->write_bandwidth),
      99             :                    nr_dirty,
     100             :                    nr_io,
     101             :                    nr_more_io,
     102             :                    nr_dirty_time,
     103             :                    !list_empty(&bdi->bdi_list), bdi->wb.state); /* "bdi_list" is 1 when bdi->bdi_list is non-empty, else 0 */
     104             : 
     105             :         return 0;
     106             : }
     107             : DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); /* provides bdi_debug_stats_fops, used by bdi_debug_register() */
     108             : 
     109             : static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
     110             : { /* Create debugfs directory @name under bdi_debug_root and a "stats" file in it, with @bdi as file data. */
     111             :         bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
     112             : 
     113             :         debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, /* mode 0444: read-only */
     114             :                             &bdi_debug_stats_fops);
     115             : }
     116             : 
     117             : static void bdi_debug_unregister(struct backing_dev_info *bdi)
     118             : { /* Remove bdi->debug_dir (set by bdi_debug_register()) together with its contents. */
     119             :         debugfs_remove_recursive(bdi->debug_dir);
     120             : }
     121             : #else
     122             : static inline void bdi_debug_init(void)
     123             : { /* no-op stub used when CONFIG_DEBUG_FS is not defined */
     124             : }
     125             : static inline void bdi_debug_register(struct backing_dev_info *bdi,
     126             :                                       const char *name)
     127             : { /* no-op stub used when CONFIG_DEBUG_FS is not defined */
     128             : }
     129             : static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
     130             : { /* no-op stub used when CONFIG_DEBUG_FS is not defined */
     131             : }
     132             : #endif
     133             : 
     134           0 : static ssize_t read_ahead_kb_store(struct device *dev,
     135             :                                   struct device_attribute *attr,
     136             :                                   const char *buf, size_t count)
     137             : { /* sysfs store: parse a decimal kB value from @buf and set bdi->ra_pages; returns @count, or the kstrtoul() error */
     138           0 :         struct backing_dev_info *bdi = dev_get_drvdata(dev);
     139             :         unsigned long read_ahead_kb;
     140             :         ssize_t ret;
     141             : 
     142           0 :         ret = kstrtoul(buf, 10, &read_ahead_kb);
     143           0 :         if (ret < 0)
     144             :                 return ret;
     145             : 
     146           0 :         bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); /* kB -> pages, the inverse of K() */
     147             : 
     148           0 :         return count;
     149             : }
     150             : 
     151             : #define BDI_SHOW(name, expr)                                            \
     152             : static ssize_t name##_show(struct device *dev,                          \
     153             :                            struct device_attribute *attr, char *buf)    \
     154             : {                                                                       \
     155             :         struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
     156             :                                                                         \
     157             :         return sysfs_emit(buf, "%lld\n", (long long)expr);            \
     158             : }                                                                       \
     159             : static DEVICE_ATTR_RW(name); /* BDI_SHOW emits name##_show() printing @expr as "%lld\n", plus the RW attribute; name##_store() is written by hand */
     160             : 
     161           0 : BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) /* shows bdi->ra_pages converted to kB */
     162             : 
     163           0 : static ssize_t min_ratio_store(struct device *dev,
     164             :                 struct device_attribute *attr, const char *buf, size_t count)
     165             : { /* sysfs store: parse an unsigned decimal ratio from @buf and hand it to bdi_set_min_ratio() */
     166           0 :         struct backing_dev_info *bdi = dev_get_drvdata(dev);
     167             :         unsigned int ratio;
     168             :         ssize_t ret;
     169             : 
     170           0 :         ret = kstrtouint(buf, 10, &ratio);
     171           0 :         if (ret < 0)
     172             :                 return ret;
     173             : 
     174           0 :         ret = bdi_set_min_ratio(bdi, ratio);
     175           0 :         if (!ret) /* success: report the whole input as consumed */
     176           0 :                 ret = count;
     177             : 
     178             :         return ret; /* @count, or the non-zero value from bdi_set_min_ratio() */
     179             : }
     180           0 : BDI_SHOW(min_ratio, bdi->min_ratio) /* min_ratio_show() and dev_attr_min_ratio */
     181             : 
     182           0 : static ssize_t max_ratio_store(struct device *dev,
     183             :                 struct device_attribute *attr, const char *buf, size_t count)
     184             : { /* sysfs store: parse an unsigned decimal ratio from @buf and hand it to bdi_set_max_ratio() */
     185           0 :         struct backing_dev_info *bdi = dev_get_drvdata(dev);
     186             :         unsigned int ratio;
     187             :         ssize_t ret;
     188             : 
     189           0 :         ret = kstrtouint(buf, 10, &ratio);
     190           0 :         if (ret < 0)
     191             :                 return ret;
     192             : 
     193           0 :         ret = bdi_set_max_ratio(bdi, ratio);
     194           0 :         if (!ret) /* success: report the whole input as consumed */
     195           0 :                 ret = count;
     196             : 
     197             :         return ret; /* @count, or the non-zero value from bdi_set_max_ratio() */
     198             : }
     199           0 : BDI_SHOW(max_ratio, bdi->max_ratio) /* max_ratio_show() and dev_attr_max_ratio */
     200             : 
     201           0 : static ssize_t stable_pages_required_show(struct device *dev,
     202             :                                           struct device_attribute *attr,
     203             :                                           char *buf)
     204             : { /* Legacy attribute: warn once that it is gone, then always report "0". */
     205           0 :         dev_warn_once(dev,
     206             :                 "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
     207           0 :         return sysfs_emit(buf, "%d\n", 0);
     208             : }
     209             : static DEVICE_ATTR_RO(stable_pages_required); /* read-only: there is no store handler */
     210             : 
     211             : static struct attribute *bdi_dev_attrs[] = { /* NULL-terminated list of the sysfs attributes defined above */
     212             :         &dev_attr_read_ahead_kb.attr,
     213             :         &dev_attr_min_ratio.attr,
     214             :         &dev_attr_max_ratio.attr,
     215             :         &dev_attr_stable_pages_required.attr,
     216             :         NULL,
     217             : };
     218             : ATTRIBUTE_GROUPS(bdi_dev); /* provides bdi_dev_groups, installed as bdi_class->dev_groups in bdi_class_init() */
     219             : 
     220           1 : static __init int bdi_class_init(void)
     221             : { /* Create the "bdi" device class, attach bdi_dev_groups and set up debugfs; returns 0 or the class_create() error. */
     222           1 :         bdi_class = class_create(THIS_MODULE, "bdi");
     223           2 :         if (IS_ERR(bdi_class))
     224           0 :                 return PTR_ERR(bdi_class);
     225             : 
     226           1 :         bdi_class->dev_groups = bdi_dev_groups;
     227             :         bdi_debug_init();
     228             : 
     229           1 :         return 0;
     230             : }
     231             : postcore_initcall(bdi_class_init);
     232             : 
     233             : static int bdi_init(struct backing_dev_info *bdi);
     234             : 
     235           1 : static int __init default_bdi_init(void)
     236             : { /* Allocate the "writeback" workqueue (bdi_wq) and initialise noop_backing_dev_info. */
     237             :         int err;
     238             : 
     239           1 :         bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
     240             :                                  WQ_SYSFS, 0);
     241           1 :         if (!bdi_wq)
     242             :                 return -ENOMEM;
     243             : 
     244           1 :         err = bdi_init(&noop_backing_dev_info);
     245             : 
     246           1 :         return err; /* result of bdi_init() */
     247             : }
     248             : subsys_initcall(default_bdi_init);
     249             : 
     250             : /*
     251             :  * This function is used when the first inode for this wb is marked dirty. It
     252             :  * wakes up the corresponding bdi thread which should then take care of the
     253             :  * periodic background write-out of dirty inodes. Since the write-out would
     254             :  * start only 'dirty_writeback_interval' centisecs from now anyway, we just
     255             :  * set up a timer which wakes the bdi thread up later.
     256             :  *
     257             :  * Note, we wouldn't bother setting up the timer, but this function is on the
     258             :  * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
     259             :  * by delaying the wake-up.
     260             :  *
     261             :  * We have to be careful not to postpone flush work if it is scheduled for
     262             :  * earlier. Thus we use queue_delayed_work().
     263             :  */
     264           0 : void wb_wakeup_delayed(struct bdi_writeback *wb)
     265             : {
     266             :         unsigned long timeout;
     267             : 
     268           0 :         timeout = msecs_to_jiffies(dirty_writeback_interval * 10); /* centisecs * 10 = msecs */
     269           0 :         spin_lock_bh(&wb->work_lock);
     270           0 :         if (test_bit(WB_registered, &wb->state)) /* queue only while the wb is still registered */
     271           0 :                 queue_delayed_work(bdi_wq, &wb->dwork, timeout);
     272           0 :         spin_unlock_bh(&wb->work_lock);
     273           0 : }
     274             : 
     275           0 : static void wb_update_bandwidth_workfn(struct work_struct *work)
     276             : { /* Work handler for wb->bw_dwork (installed in wb_init()): recover the owning wb and call wb_update_bandwidth(). */
     277           0 :         struct bdi_writeback *wb = container_of(to_delayed_work(work),
     278             :                                                 struct bdi_writeback, bw_dwork);
     279             : 
     280           0 :         wb_update_bandwidth(wb);
     281           0 : }
     282             : 
     283             : /*
     284             :  * Initial write bandwidth: 100 MB/s
     285             :  */
     286             : #define INIT_BW         (100 << (20 - PAGE_SHIFT))
     287             : 
     288           1 : static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
     289             :                    gfp_t gfp)
     290             : { /* Zero @wb, then set up its lists, locks, bandwidth fields, work items and per-cpu state; returns 0 or the failing init's error. */
     291             :         int i, err;
     292             : 
     293           1 :         memset(wb, 0, sizeof(*wb));
     294             : 
     295           1 :         wb->bdi = bdi;
     296           1 :         wb->last_old_flush = jiffies;
     297           2 :         INIT_LIST_HEAD(&wb->b_dirty);
     298           2 :         INIT_LIST_HEAD(&wb->b_io);
     299           2 :         INIT_LIST_HEAD(&wb->b_more_io);
     300           2 :         INIT_LIST_HEAD(&wb->b_dirty_time);
     301           1 :         spin_lock_init(&wb->list_lock);
     302             : 
     303           2 :         atomic_set(&wb->writeback_inodes, 0);
     304           1 :         wb->bw_time_stamp = jiffies;
     305           1 :         wb->balanced_dirty_ratelimit = INIT_BW; /* all four rate fields start at INIT_BW */
     306           1 :         wb->dirty_ratelimit = INIT_BW;
     307           1 :         wb->write_bandwidth = INIT_BW;
     308           1 :         wb->avg_write_bandwidth = INIT_BW;
     309             : 
     310           1 :         spin_lock_init(&wb->work_lock);
     311           2 :         INIT_LIST_HEAD(&wb->work_list);
     312           2 :         INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
     313           2 :         INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
     314           1 :         wb->dirty_sleep = jiffies;
     315             : 
     316           1 :         err = fprop_local_init_percpu(&wb->completions, gfp);
     317           1 :         if (err)
     318             :                 return err; /* nothing to undo yet */
     319             : 
     320           4 :         for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
     321           8 :                 err = percpu_counter_init(&wb->stat[i], 0, gfp);
     322             :                 if (err)
     323             :                         goto out_destroy_stat;
     324             :         }
     325             : 
     326             :         return 0;
     327             : 
     328             : out_destroy_stat:
     329             :         while (i--) /* destroy only the counters initialised before the failure */
     330             :                 percpu_counter_destroy(&wb->stat[i]);
     331             :         fprop_local_destroy_percpu(&wb->completions);
     332             :         return err;
     333             : }
     334             : 
     335             : static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
     336             : 
     337             : /*
     338             :  * Remove bdi from the global list and shutdown any threads we have running
     339             :  */
     340           0 : static void wb_shutdown(struct bdi_writeback *wb)
     341             : {
     342             :         /* Make sure nobody queues further work */
     343           0 :         spin_lock_bh(&wb->work_lock);
     344           0 :         if (!test_and_clear_bit(WB_registered, &wb->state)) { /* bit was already clear: nothing to shut down */
     345           0 :                 spin_unlock_bh(&wb->work_lock);
     346             :                 return;
     347             :         }
     348           0 :         spin_unlock_bh(&wb->work_lock);
     349             : 
     350           0 :         cgwb_remove_from_bdi_list(wb);
     351             :         /*
     352             :          * Drain work list and shutdown the delayed_work.  !WB_registered
     353             :          * tells wb_workfn() that @wb is dying and its work_list needs to
     354             :          * be drained no matter what.
     355             :          */
     356           0 :         mod_delayed_work(bdi_wq, &wb->dwork, 0); /* delay 0: run the work right away */
     357           0 :         flush_delayed_work(&wb->dwork);
     358           0 :         WARN_ON(!list_empty(&wb->work_list));
     359           0 :         flush_delayed_work(&wb->bw_dwork); /* also flush the bandwidth-update work */
     360             : }
     361             : 
     362           0 : static void wb_exit(struct bdi_writeback *wb)
     363             : { /* Release the per-cpu counters and completions that wb_init() set up. */
     364             :         int i;
     365             : 
     366           0 :         WARN_ON(delayed_work_pending(&wb->dwork)); /* warn if dwork is still pending at teardown */
     367             : 
     368             :         for (i = 0; i < NR_WB_STAT_ITEMS; i++)
     369             :                 percpu_counter_destroy(&wb->stat[i]);
     370             : 
     371           0 :         fprop_local_destroy_percpu(&wb->completions);
     372           0 : }
     373             : 
     374             : #ifdef CONFIG_CGROUP_WRITEBACK
     375             : 
     376             : #include <linux/memcontrol.h>
     377             : 
     378             : /*
     379             :  * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
     380             :  * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
     381             :  */
     382             : static DEFINE_SPINLOCK(cgwb_lock);
     383             : static struct workqueue_struct *cgwb_release_wq; /* runs cgwb_release_workfn(), queued from cgwb_release() */
     384             : 
     385             : static LIST_HEAD(offline_cgwbs); /* wbs added by cgwb_kill() and unlinked by cgwb_release_workfn() */
     386             : static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
     387             : static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
     388             : 
     389             : static void cgwb_release_workfn(struct work_struct *work)
     390             : { /* Final teardown of a cgroup wb; queued on cgwb_release_wq by cgwb_release(). */
     391             :         struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
     392             :                                                 release_work);
     393             :         struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
     394             :         struct backing_dev_info *bdi = wb->bdi;
     395             : 
     396             :         mutex_lock(&wb->bdi->cgwb_release_mutex);
     397             :         wb_shutdown(wb);
     398             : 
     399             :         css_put(wb->memcg_css); /* pairs with the css_get() calls in cgwb_create() */
     400             :         css_put(wb->blkcg_css);
     401             :         mutex_unlock(&wb->bdi->cgwb_release_mutex);
     402             : 
     403             :         /* triggers blkg destruction if no online users left */
     404             :         blkcg_unpin_online(blkcg);
     405             : 
     406             :         fprop_local_destroy_percpu(&wb->memcg_completions);
     407             : 
     408             :         spin_lock_irq(&cgwb_lock);
     409             :         list_del(&wb->offline_node); /* leave offline_cgwbs, where cgwb_kill() put this wb */
     410             :         spin_unlock_irq(&cgwb_lock);
     411             : 
     412             :         percpu_ref_exit(&wb->refcnt);
     413             :         wb_exit(wb);
     414             :         bdi_put(bdi); /* pairs with bdi_get() in cgwb_create() */
     415             :         WARN_ON_ONCE(!list_empty(&wb->b_attached));
     416             :         kfree_rcu(wb, rcu); /* RCU-deferred free via wb->rcu */
     417             : }
     418             : 
     419             : static void cgwb_release(struct percpu_ref *refcnt)
     420             : { /* Release callback given to percpu_ref_init() in cgwb_create(): hand the teardown to cgwb_release_wq. */
     421             :         struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
     422             :                                                 refcnt);
     423             :         queue_work(cgwb_release_wq, &wb->release_work);
     424             : }
     425             : 
     426             : static void cgwb_kill(struct bdi_writeback *wb)
     427             : { /* Unlink @wb from the bdi's cgwb_tree and the memcg/blkcg lists, add it to offline_cgwbs and kill its refcnt. */
     428             :         lockdep_assert_held(&cgwb_lock); /* caller must hold cgwb_lock */
     429             : 
     430             :         WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); /* warn if @wb was not in the tree */
     431             :         list_del(&wb->memcg_node);
     432             :         list_del(&wb->blkcg_node);
     433             :         list_add(&wb->offline_node, &offline_cgwbs);
     434             :         percpu_ref_kill(&wb->refcnt);
     435             : }
     436             : 
     437             : static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
     438             : { /* Unlink @wb->bdi_node (added to bdi->wb_list in cgwb_create()) under cgwb_lock, using the RCU list primitive. */
     439             :         spin_lock_irq(&cgwb_lock);
     440             :         list_del_rcu(&wb->bdi_node);
     441             :         spin_unlock_irq(&cgwb_lock);
     442             : }
     443             : 
     444             : static int cgwb_create(struct backing_dev_info *bdi,
     445             :                        struct cgroup_subsys_state *memcg_css, gfp_t gfp)
     446             : { /* Create and link a wb for @memcg_css on @bdi. Returns 0 if one was linked or already exists, else a negative errno. */
     447             :         struct mem_cgroup *memcg;
     448             :         struct cgroup_subsys_state *blkcg_css;
     449             :         struct blkcg *blkcg;
     450             :         struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
     451             :         struct bdi_writeback *wb;
     452             :         unsigned long flags;
     453             :         int ret = 0;
     454             : 
     455             :         memcg = mem_cgroup_from_css(memcg_css);
     456             :         blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); /* released by css_put() at out_put */
     457             :         blkcg = css_to_blkcg(blkcg_css);
     458             :         memcg_cgwb_list = &memcg->cgwb_list;
     459             :         blkcg_cgwb_list = &blkcg->cgwb_list;
     460             : 
     461             :         /* look up again under lock and discard on blkcg mismatch */
     462             :         spin_lock_irqsave(&cgwb_lock, flags);
     463             :         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
     464             :         if (wb && wb->blkcg_css != blkcg_css) {
     465             :                 cgwb_kill(wb);
     466             :                 wb = NULL;
     467             :         }
     468             :         spin_unlock_irqrestore(&cgwb_lock, flags);
     469             :         if (wb) /* a matching wb already exists: return with ret == 0 */
     470             :                 goto out_put;
     471             : 
     472             :         /* need to create a new one */
     473             :         wb = kmalloc(sizeof(*wb), gfp);
     474             :         if (!wb) {
     475             :                 ret = -ENOMEM;
     476             :                 goto out_put;
     477             :         }
     478             : 
     479             :         ret = wb_init(wb, bdi, gfp);
     480             :         if (ret)
     481             :                 goto err_free;
     482             : 
     483             :         ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
     484             :         if (ret)
     485             :                 goto err_wb_exit;
     486             : 
     487             :         ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
     488             :         if (ret)
     489             :                 goto err_ref_exit;
     490             : 
     491             :         wb->memcg_css = memcg_css;
     492             :         wb->blkcg_css = blkcg_css;
     493             :         INIT_LIST_HEAD(&wb->b_attached);
     494             :         INIT_WORK(&wb->release_work, cgwb_release_workfn);
     495             :         set_bit(WB_registered, &wb->state);
     496             :         bdi_get(bdi); /* dropped in cgwb_release_workfn(), or at err_fprop_exit below */
     497             : 
     498             :         /*
     499             :          * The root wb determines the registered state of the whole bdi and
     500             :          * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
     501             :          * whether they're still online.  Don't link @wb if any is dead.
     502             :          * See wb_memcg_offline() and wb_blkcg_offline().
     503             :          */
     504             :         ret = -ENODEV; /* stays -ENODEV when the check below refuses to link @wb */
     505             :         spin_lock_irqsave(&cgwb_lock, flags);
     506             :         if (test_bit(WB_registered, &bdi->wb.state) &&
     507             :             blkcg_cgwb_list->next && memcg_cgwb_list->next) {
     508             :                 /* we might have raced another instance of this function */
     509             :                 ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
     510             :                 if (!ret) {
     511             :                         list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
     512             :                         list_add(&wb->memcg_node, memcg_cgwb_list);
     513             :                         list_add(&wb->blkcg_node, blkcg_cgwb_list);
     514             :                         blkcg_pin_online(blkcg);
     515             :                         css_get(memcg_css); /* both css refs are put in cgwb_release_workfn() */
     516             :                         css_get(blkcg_css);
     517             :                 }
     518             :         }
     519             :         spin_unlock_irqrestore(&cgwb_lock, flags);
     520             :         if (ret) {
     521             :                 if (ret == -EEXIST) /* slot already occupied: report success but still free our wb */
     522             :                         ret = 0;
     523             :                 goto err_fprop_exit;
     524             :         }
     525             :         goto out_put; /* success: keep @wb, skip the unwind labels */
     526             : 
     527             : err_fprop_exit: /* unwind in reverse order of setup */
     528             :         bdi_put(bdi);
     529             :         fprop_local_destroy_percpu(&wb->memcg_completions);
     530             : err_ref_exit:
     531             :         percpu_ref_exit(&wb->refcnt);
     532             : err_wb_exit:
     533             :         wb_exit(wb);
     534             : err_free:
     535             :         kfree(wb);
     536             : out_put:
     537             :         css_put(blkcg_css);
     538             :         return ret;
     539             : }
     540             : 
     541             : /**
     542             :  * wb_get_lookup - get wb for a given memcg
     543             :  * @bdi: target bdi
     544             :  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
     545             :  *
     546             :  * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
     547             :  * refcount incremented.
     548             :  *
     549             :  * This function uses css_get() on @memcg_css and thus expects its refcnt
     550             :  * to be positive on invocation.  IOW, rcu_read_lock() protection on
     551             :  * @memcg_css isn't enough.  try_get it before calling this function.
     552             :  *
     553             :  * A wb is keyed by its associated memcg.  As blkcg implicitly enables
     554             :  * memcg on the default hierarchy, memcg association is guaranteed to be
     555             :  * more specific (equal or descendant to the associated blkcg) and thus can
     556             :  * identify both the memcg and blkcg associations.
     557             :  *
     558             :  * Because the blkcg associated with a memcg may change as blkcg is enabled
     559             :  * and disabled closer to root in the hierarchy, each wb keeps track of
     560             :  * both the memcg and blkcg associated with it and verifies the blkcg on
     561             :  * each lookup.  On mismatch, the existing wb is discarded and a new one is
     562             :  * created.
     563             :  */
     564             : struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
     565             :                                     struct cgroup_subsys_state *memcg_css)
     566             : {
     567             :         struct bdi_writeback *wb;
     568             : 
     569             :         if (!memcg_css->parent) /* css without a parent: use the bdi's embedded wb */
     570             :                 return &bdi->wb;
     571             : 
     572             :         rcu_read_lock();
     573             :         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
     574             :         if (wb) {
     575             :                 struct cgroup_subsys_state *blkcg_css;
     576             : 
     577             :                 /* see whether the blkcg association has changed */
     578             :                 blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
     579             :                 if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) /* blkcg mismatch or failed tryget: return NULL */
     580             :                         wb = NULL;
     581             :                 css_put(blkcg_css); /* drop the lookup ref from cgroup_get_e_css() */
     582             :         }
     583             :         rcu_read_unlock();
     584             : 
     585             :         return wb; /* NULL when no usable wb was found */
     586             : }
     587             : 
     588             : /**
     589             :  * wb_get_create - get wb for a given memcg, create if necessary
     590             :  * @bdi: target bdi
     591             :  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
     592             :  * @gfp: allocation mask to use
     593             :  *
     594             :  * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
     595             :  * create one.  See wb_get_lookup() for more details.  Returns NULL if cgwb_create() fails.
     596             :  */
     597             : struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
     598             :                                     struct cgroup_subsys_state *memcg_css,
     599             :                                     gfp_t gfp)
     600             : {
     601             :         struct bdi_writeback *wb;
     602             : 
     603             :         might_alloc(gfp);
     604             : 
     605             :         if (!memcg_css->parent) /* same shortcut as wb_get_lookup(): the wb embedded in @bdi */
     606             :                 return &bdi->wb;
     607             : 
     608             :         do { /* look up again each time cgwb_create() returns 0 */
     609             :                 wb = wb_get_lookup(bdi, memcg_css);
     610             :         } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); /* stop on a hit, or when cgwb_create() returns non-zero */
     611             : 
     612             :         return wb;
     613             : }
     614             : 
     615             : static int cgwb_bdi_init(struct backing_dev_info *bdi)
     616             : {       /* Cgroup-writeback variant: set up the cgwb bookkeeping of @bdi and its embedded wb. */
     617             :         int ret;
     618             : 
     619             :         INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); /* the tree wb_get_lookup() searches */
     620             :         mutex_init(&bdi->cgwb_release_mutex);
     621             :         init_rwsem(&bdi->wb_switch_rwsem);
     622             : 
     623             :         ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); /* initialise the wb embedded in the bdi */
     624             :         if (!ret) { /* on success, bind the embedded wb to the root memcg and root blkcg css */
     625             :                 bdi->wb.memcg_css = &root_mem_cgroup->css;
     626             :                 bdi->wb.blkcg_css = blkcg_root_css;
     627             :         }
     628             :         return ret; /* 0, or whatever wb_init() returned */
     629             : }
     630             : 
     631             : static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
     632             : {       /* Kill every cgroup wb of @bdi, then shut down whatever is still on bdi->wb_list. */
     633             :         struct radix_tree_iter iter;
     634             :         void **slot;
     635             :         struct bdi_writeback *wb;
     636             : 
     637             :         WARN_ON(test_bit(WB_registered, &bdi->wb.state)); /* the embedded wb is expected to be unregistered already */
     638             : 
     639             :         spin_lock_irq(&cgwb_lock);
     640             :         radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) /* every entry of the tree, starting at index 0 */
     641             :                 cgwb_kill(*slot);
     642             :         spin_unlock_irq(&cgwb_lock);
     643             : 
     644             :         mutex_lock(&bdi->cgwb_release_mutex); /* NOTE(review): presumably excludes the cgwb release work - confirm */
     645             :         spin_lock_irq(&cgwb_lock);
     646             :         while (!list_empty(&bdi->wb_list)) { /* NOTE(review): terminates only if wb_shutdown() (or its callees) unlinks wb from wb_list - confirm */
     647             :                 wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
     648             :                                       bdi_node);
     649             :                 spin_unlock_irq(&cgwb_lock); /* cgwb_lock is dropped around wb_shutdown() */
     650             :                 wb_shutdown(wb);
     651             :                 spin_lock_irq(&cgwb_lock);
     652             :         }
     653             :         spin_unlock_irq(&cgwb_lock);
     654             :         mutex_unlock(&bdi->cgwb_release_mutex);
     655             : }
     656             : 
     657             : /*
     658             :  * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
     659             :  *
     660             :  * Try to release dying cgwbs by switching attached inodes to the nearest
     661             :  * living ancestor's writeback. Processed wbs are placed at the end
     662             :  * of the list to guarantee forward progress.
     663             :  */
     664             : static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
     665             : {
     666             :         struct bdi_writeback *wb;
     667             :         LIST_HEAD(processed); /* wbs already visited in this run */
     668             : 
     669             :         spin_lock_irq(&cgwb_lock);
     670             : 
     671             :         while (!list_empty(&offline_cgwbs)) {
     672             :                 wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
     673             :                                       offline_node);
     674             :                 list_move(&wb->offline_node, &processed); /* off the global list first, so every iteration shrinks it */
     675             : 
     676             :                 /*
     677             :                  * If wb is dirty, cleaning up the writeback by switching
     678             :                  * attached inodes will result in an effective removal of any
     679             :                  * bandwidth restrictions, which isn't the goal.  Instead,
     680             :                  * it can be postponed until the next time, when all io
     681             :                  * will likely be completed.  If in the meantime some inodes
     682             :                  * get re-dirtied, they should eventually be switched to
     683             :                  * a new cgwb.
     684             :                  */
     685             :                 if (wb_has_dirty_io(wb))
     686             :                         continue;
     687             : 
     688             :                 if (!wb_tryget(wb)) /* no reference obtained: leave this wb alone */
     689             :                         continue;
     690             : 
     691             :                 spin_unlock_irq(&cgwb_lock); /* cleanup_offline_cgwb() is called without cgwb_lock held */
     692             :                 while (cleanup_offline_cgwb(wb)) /* repeat for as long as it returns non-zero */
     693             :                         cond_resched();
     694             :                 spin_lock_irq(&cgwb_lock);
     695             : 
     696             :                 wb_put(wb); /* pairs with wb_tryget() above */
     697             :         }
     698             : 
     699             :         if (!list_empty(&processed))
     700             :                 list_splice_tail(&processed, &offline_cgwbs); /* return the visited wbs to the tail of the global list */
     701             : 
     702             :         spin_unlock_irq(&cgwb_lock);
     703             : }
     704             : 
     705             : /**
     706             :  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
     707             :  * @memcg: memcg being offlined
     708             :  *
     709             :  * Also prevents creation of any new wb's associated with @memcg, and
     710             :  * queues cleanup_offline_cgwbs_work on system_unbound_wq.
     711             : void wb_memcg_offline(struct mem_cgroup *memcg)
     712             : {
     713             :         struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
     714             :         struct bdi_writeback *wb, *next;
     715             : 
     716             :         spin_lock_irq(&cgwb_lock);
     717             :         list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) /* NOTE(review): _safe form suggests cgwb_kill() unlinks wb - confirm */
     718             :                 cgwb_kill(wb);
     719             :         memcg_cgwb_list->next = NULL;        /* prevent new wb's */
     720             :         spin_unlock_irq(&cgwb_lock);
     721             : 
     722             :         queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); /* schedule the deferred cleanup of offline cgwbs */
     723             : }
     724             : 
     725             : /**
     726             :  * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
     727             :  * @blkcg: blkcg being offlined
     728             :  *
     729             :  * Also prevents creation of any new wb's associated with @blkcg.
     730             :  */
     731             : void wb_blkcg_offline(struct blkcg *blkcg)
     732             : {
     733             :         struct bdi_writeback *wb, *next;
     734             : 
     735             :         spin_lock_irq(&cgwb_lock); /* the whole list walk runs under cgwb_lock */
     736             :         list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) /* NOTE(review): _safe form suggests cgwb_kill() unlinks wb - confirm */
     737             :                 cgwb_kill(wb);
     738             :         blkcg->cgwb_list.next = NULL;        /* prevent new wb's */
     739             :         spin_unlock_irq(&cgwb_lock);
     740             : }
     741             : 
     742             : static void cgwb_bdi_register(struct backing_dev_info *bdi)
     743             : {       /* Cgroup-writeback variant: link the embedded wb into bdi->wb_list under cgwb_lock. */
     744             :         spin_lock_irq(&cgwb_lock);
     745             :         list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
     746             :         spin_unlock_irq(&cgwb_lock);
     747             : }
     748             : 
     749             : static int __init cgwb_init(void)
     750             : {       /* Boot-time setup: create the workqueue used for cgwb release work. */
     751             :         /*
     752             :          * There can be many concurrent release work items overwhelming
     753             :          * system_wq.  Put them in a separate wq and limit concurrency.
     754             :          * There's no point in executing many of these in parallel.
     755             :          */
     756             :         cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); /* no flags, max_active of 1 */
     757             :         if (!cgwb_release_wq)
     758             :                 return -ENOMEM;
     759             : 
     760             :         return 0;
     761             : }
     762             : subsys_initcall(cgwb_init);
     763             : 
     764             : #else   /* CONFIG_CGROUP_WRITEBACK */
     765             : 
     766             : static int cgwb_bdi_init(struct backing_dev_info *bdi)
     767             : {       /* !CONFIG_CGROUP_WRITEBACK variant: only the embedded wb needs initialising. */
     768           1 :         return wb_init(&bdi->wb, bdi, GFP_KERNEL);
     769             : }
     770             : 
     771             : static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } /* !CONFIG_CGROUP_WRITEBACK variant: intentionally empty */
     772             : 
     773             : static void cgwb_bdi_register(struct backing_dev_info *bdi)
     774             : {       /* !CONFIG_CGROUP_WRITEBACK variant: link the embedded wb into bdi->wb_list (no cgwb_lock taken here). */
     775           0 :         list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
     776             : }
     777             : 
     778             : static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
     779             : {       /* !CONFIG_CGROUP_WRITEBACK variant: RCU-safe unlink of @wb from its bdi's wb_list. */
     780           0 :         list_del_rcu(&wb->bdi_node);
     781             : }
     782             : 
     783             : #endif  /* CONFIG_CGROUP_WRITEBACK */
     784             : 
     785           1 : static int bdi_init(struct backing_dev_info *bdi)
     786             : {       /* Give @bdi its default field values, then initialise its writeback state via cgwb_bdi_init(). */
     787             :         int ret;
     788             : 
     789           1 :         bdi->dev = NULL; /* no device until bdi_register_va() sets one */
     790             : 
     791           2 :         kref_init(&bdi->refcnt); /* reference count that bdi_put() drops */
     792           1 :         bdi->min_ratio = 0;
     793           1 :         bdi->max_ratio = 100;
     794           1 :         bdi->max_prop_frac = FPROP_FRAC_BASE;
     795           2 :         INIT_LIST_HEAD(&bdi->bdi_list); /* node for the global bdi_list */
     796           2 :         INIT_LIST_HEAD(&bdi->wb_list); /* head of this bdi's list of wbs */
     797           1 :         init_waitqueue_head(&bdi->wb_waitq);
     798             : 
     799           1 :         ret = cgwb_bdi_init(bdi);
     800             : 
     801           1 :         return ret; /* 0, or the error from cgwb_bdi_init() */
     802             : }
     803             : 
     804           0 : struct backing_dev_info *bdi_alloc(int node_id)
     805             : {       /* Allocate and initialise a bdi for node @node_id; returns NULL on allocation or init failure. */
     806             :         struct backing_dev_info *bdi;
     807             : 
     808           0 :         bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); /* zero-filled allocation */
     809           0 :         if (!bdi)
     810             :                 return NULL;
     811             : 
     812           0 :         if (bdi_init(bdi)) { /* non-zero means init failed: free and report NULL (the error code is not propagated) */
     813           0 :                 kfree(bdi);
     814           0 :                 return NULL;
     815             :         }
     816           0 :         bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; /* default capabilities for an allocated bdi */
     817           0 :         bdi->ra_pages = VM_READAHEAD_PAGES;
     818           0 :         bdi->io_pages = VM_READAHEAD_PAGES;
     819           0 :         timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); /* stopped again by del_timer_sync() in bdi_unregister() */
     820           0 :         return bdi;
     821             : }
     822             : EXPORT_SYMBOL(bdi_alloc);
     823             : 
     824             : static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
     825             : {       /* Walk bdi_tree for @id; returns the link slot (non-NULL *slot on a match, else the empty slot where @id would go). */
     826             :         struct rb_node **p = &bdi_tree.rb_node;
     827             :         struct rb_node *parent = NULL;
     828             :         struct backing_dev_info *bdi;
     829             : 
     830             :         lockdep_assert_held(&bdi_lock); /* callers must hold bdi_lock */
     831             : 
     832           0 :         while (*p) {
     833           0 :                 parent = *p;
     834           0 :                 bdi = rb_entry(parent, struct backing_dev_info, rb_node);
     835             : 
     836           0 :                 if (bdi->id > id) /* smaller ids live in the left subtree */
     837           0 :                         p = &(*p)->rb_left;
     838           0 :                 else if (bdi->id < id)
     839           0 :                         p = &(*p)->rb_right;
     840             :                 else
     841             :                         break; /* exact match: *p is the matching node */
     842             :         }
     843             : 
     844             :         if (parentp) /* optional out-parameter: last node visited (the match itself on a hit, NULL for an empty tree) */
     845           0 :                 *parentp = parent;
     846             :         return p;
     847             : }
     848             : 
     849             : /**
     850             :  * bdi_get_by_id - lookup and get bdi from its id
     851             :  * @id: bdi id to lookup
     852             :  *
     853             :  * Find bdi matching @id and get it.  Returns NULL if the matching bdi
     854             :  * doesn't exist or is already unregistered.  Pair a non-NULL result with bdi_put().
     855             :  */
     856           0 : struct backing_dev_info *bdi_get_by_id(u64 id)
     857             : {
     858           0 :         struct backing_dev_info *bdi = NULL;
     859             :         struct rb_node **p;
     860             : 
     861             :         spin_lock_bh(&bdi_lock);
     862           0 :         p = bdi_lookup_rb_node(id, NULL); /* the parent node is not needed for a pure lookup */
     863           0 :         if (*p) { /* a non-NULL slot means an exact id match */
     864           0 :                 bdi = rb_entry(*p, struct backing_dev_info, rb_node);
     865             :                 bdi_get(bdi); /* take the reference while still holding bdi_lock */
     866             :         }
     867           0 :         spin_unlock_bh(&bdi_lock);
     868             : 
     869           0 :         return bdi;
     870             : }
     871             : 
     872           0 : int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
     873             : {       /* Name @bdi from @fmt/@args, create its device and publish it in bdi_tree and bdi_list; returns 0 or the device_create() error. */
     874             :         struct device *dev;
     875             :         struct rb_node *parent, **p;
     876             : 
     877           0 :         if (bdi->dev)        /* The driver needs to use separate queues per device */
     878             :                 return 0;
     879             : 
     880           0 :         vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); /* bounded, so an over-long name is truncated */
     881           0 :         dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, "%s", bdi->dev_name); /* the name is already formatted: pass it as data, never as a format string */
     882           0 :         if (IS_ERR(dev))
     883           0 :                 return PTR_ERR(dev);
     884             : 
     885           0 :         cgwb_bdi_register(bdi); /* link the embedded wb into bdi->wb_list */
     886           0 :         bdi->dev = dev;
     887             : 
     888           0 :         bdi_debug_register(bdi, dev_name(dev));
     889           0 :         set_bit(WB_registered, &bdi->wb.state);
     890             : 
     891           0 :         spin_lock_bh(&bdi_lock); /* bdi_lock protects bdi_tree and updates to bdi_list */
     892             : 
     893           0 :         bdi->id = ++bdi_id_cursor; /* ids come from a counter that only increments */
     894             : 
     895           0 :         p = bdi_lookup_rb_node(bdi->id, &parent); /* find the empty slot for the new id */
     896           0 :         rb_link_node(&bdi->rb_node, parent, p);
     897           0 :         rb_insert_color(&bdi->rb_node, &bdi_tree);
     898             : 
     899           0 :         list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
     900             : 
     901           0 :         spin_unlock_bh(&bdi_lock);
     902             : 
     903           0 :         trace_writeback_bdi_register(bdi);
     904           0 :         return 0;
     905             : }
     906             : 
     907           0 : int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
     908             : {       /* Variadic front end: packs the arguments into a va_list and forwards to bdi_register_va(). */
     909             :         va_list args;
     910             :         int ret;
     911             : 
     912           0 :         va_start(args, fmt);
     913           0 :         ret = bdi_register_va(bdi, fmt, args);
     914           0 :         va_end(args);
     915           0 :         return ret; /* result of bdi_register_va(), unchanged */
     916             : }
     917             : EXPORT_SYMBOL(bdi_register);
     918             : 
     919           0 : void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
     920             : {       /* Record @owner as the owning device of @bdi and take a device reference on it. */
     921           0 :         WARN_ON_ONCE(bdi->owner); /* warn if an owner was already set; it is overwritten regardless */
     922           0 :         bdi->owner = owner;
     923           0 :         get_device(owner); /* released by put_device() in bdi_unregister() */
     924           0 : }
     925             : 
     926             : /*
     927             :  * Remove bdi from bdi_tree and bdi_list, and ensure that it is no longer visible
     928             :  */
     929           0 : static void bdi_remove_from_list(struct backing_dev_info *bdi)
     930             : {
     931           0 :         spin_lock_bh(&bdi_lock);
     932           0 :         rb_erase(&bdi->rb_node, &bdi_tree); /* bdi_get_by_id() can no longer find it */
     933           0 :         list_del_rcu(&bdi->bdi_list);
     934           0 :         spin_unlock_bh(&bdi_lock);
     935             : 
     936             :         synchronize_rcu_expedited(); /* wait for RCU readers that may still be walking bdi_list */
     937           0 : }
     938             : 
     939           0 : void bdi_unregister(struct backing_dev_info *bdi)
     940             : {       /* Undo registration: unpublish @bdi, shut down its wbs, then drop its device and owner reference. */
     941           0 :         del_timer_sync(&bdi->laptop_mode_wb_timer); /* the timer set up in bdi_alloc() */
     942             : 
     943             :         /* make sure nobody finds us on the bdi_list anymore */
     944           0 :         bdi_remove_from_list(bdi);
     945           0 :         wb_shutdown(&bdi->wb); /* embedded wb first; cgwb_bdi_unregister() warns if it is still WB_registered */
     946           0 :         cgwb_bdi_unregister(bdi);
     947             : 
     948             :         /*
     949             :          * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
     950             :          * update the global bdi_min_ratio.
     951             :          */
     952           0 :         if (bdi->min_ratio)
     953           0 :                 bdi_set_min_ratio(bdi, 0);
     954             : 
     955           0 :         if (bdi->dev) { /* only when bdi_register_va() created a device */
     956           0 :                 bdi_debug_unregister(bdi);
     957           0 :                 device_unregister(bdi->dev);
     958           0 :                 bdi->dev = NULL; /* release_bdi() warns if this is still set */
     959             :         }
     960             : 
     961           0 :         if (bdi->owner) { /* drop the reference taken in bdi_set_owner() */
     962           0 :                 put_device(bdi->owner);
     963           0 :                 bdi->owner = NULL;
     964             :         }
     965           0 : }
     966             : EXPORT_SYMBOL(bdi_unregister);
     967             : 
     968           0 : static void release_bdi(struct kref *ref)
     969             : {       /* kref release callback passed to kref_put() by bdi_put(): tear down the embedded wb and free the bdi. */
     970           0 :         struct backing_dev_info *bdi =
     971           0 :                         container_of(ref, struct backing_dev_info, refcnt);
     972             : 
     973           0 :         WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); /* the bdi should have been unregistered before its last put */
     974           0 :         WARN_ON_ONCE(bdi->dev); /* bdi_unregister() clears ->dev */
     975           0 :         wb_exit(&bdi->wb);
     976           0 :         kfree(bdi);
     977           0 : }
     978             : 
     979           0 : void bdi_put(struct backing_dev_info *bdi)
     980             : {       /* Drop one reference on @bdi; release_bdi() runs when the last reference goes away. */
     981           0 :         kref_put(&bdi->refcnt, release_bdi);
     982           0 : }
     983             : EXPORT_SYMBOL(bdi_put);
     984             : 
     985           0 : struct backing_dev_info *inode_to_bdi(struct inode *inode)
     986             : {       /* Map @inode to its bdi; no reference is taken on the result here. */
     987             :         struct super_block *sb;
     988             : 
     989           0 :         if (!inode) /* a NULL inode maps to the global noop bdi */
     990             :                 return &noop_backing_dev_info;
     991             : 
     992           0 :         sb = inode->i_sb;
     993             : #ifdef CONFIG_BLOCK
     994           0 :         if (sb_is_blkdev_sb(sb)) /* block-device inode: use the bdi of its disk, not the superblock's */
     995           0 :                 return I_BDEV(inode)->bd_disk->bdi;
     996             : #endif
     997           0 :         return sb->s_bdi; /* everything else: the superblock's bdi */
     998             : }
     999             : EXPORT_SYMBOL(inode_to_bdi);
    1000             : 
    1001           0 : const char *bdi_dev_name(struct backing_dev_info *bdi)
    1002             : {       /* Return a printable name for @bdi; NULL-safe. */
    1003           0 :         if (!bdi || !bdi->dev) /* no bdi, or no registered device: fall back to "(unknown)" */
    1004           0 :                 return bdi_unknown_name;
    1005           0 :         return bdi->dev_name; /* the name formatted in bdi_register_va() */
    1006             : }
    1007             : EXPORT_SYMBOL_GPL(bdi_dev_name);

Generated by: LCOV version 1.14