Skip to content

Commit e1b849c

Browse files
jankarabrauner
authored andcommitted
writeback: Avoid contention on wb->list_lock when switching inodes
There can be multiple inode switch works that are trying to switch inodes to / from the same wb. This can happen in particular if some cgroup exits which owns many (thousands) inodes and we need to switch them all. In this case several inode_switch_wbs_work_fn() instances will be just spinning on the same wb->list_lock while only one of them makes forward progress. This wastes CPU cycles and quickly leads to softlockup reports and unusable system. Instead of running several inode_switch_wbs_work_fn() instances in parallel switching to the same wb and contending on wb->list_lock, run just one work item per wb and manage a queue of isw items switching to this wb. Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jan Kara <jack@suse.cz>
1 parent 8f5ae30 commit e1b849c

File tree

4 files changed

+74
-36
lines changed

4 files changed

+74
-36
lines changed

fs/fs-writeback.c

Lines changed: 63 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
368368
}
369369

370370
struct inode_switch_wbs_context {
371-
struct rcu_work work;
371+
/* List of queued switching contexts for the wb */
372+
struct llist_node list;
372373

373374
/*
374375
* Multiple inodes can be switched at once. The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
378379
* array embedded into struct inode_switch_wbs_context. Otherwise
379380
* an inode could be left in a non-consistent state.
380381
*/
381-
struct bdi_writeback *new_wb;
382382
struct inode *inodes[];
383383
};
384384

@@ -486,13 +486,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
486486
return switched;
487487
}
488488

489-
static void inode_switch_wbs_work_fn(struct work_struct *work)
489+
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
490+
struct inode_switch_wbs_context *isw)
490491
{
491-
struct inode_switch_wbs_context *isw =
492-
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
493492
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
494493
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
495-
struct bdi_writeback *new_wb = isw->new_wb;
496494
unsigned long nr_switched = 0;
497495
struct inode **inodep;
498496

@@ -543,6 +541,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
543541
atomic_dec(&isw_nr_in_flight);
544542
}
545543

544+
void inode_switch_wbs_work_fn(struct work_struct *work)
545+
{
546+
struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
547+
switch_work);
548+
struct inode_switch_wbs_context *isw, *next_isw;
549+
struct llist_node *list;
550+
551+
/*
552+
* Grab out reference to wb so that it cannot get freed under us
553+
* after we process all the isw items.
554+
*/
555+
wb_get(new_wb);
556+
while (1) {
557+
list = llist_del_all(&new_wb->switch_wbs_ctxs);
558+
/* Nothing to do? */
559+
if (!list)
560+
break;
561+
/*
562+
* In addition to synchronizing among switchers, I_WB_SWITCH
563+
* tells the RCU protected stat update paths to grab the i_page
564+
* lock so that stat transfer can synchronize against them.
565+
* Let's continue after I_WB_SWITCH is guaranteed to be
566+
* visible.
567+
*/
568+
synchronize_rcu();
569+
570+
llist_for_each_entry_safe(isw, next_isw, list, list)
571+
process_inode_switch_wbs(new_wb, isw);
572+
}
573+
wb_put(new_wb);
574+
}
575+
546576
static bool inode_prepare_wbs_switch(struct inode *inode,
547577
struct bdi_writeback *new_wb)
548578
{
@@ -572,6 +602,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
572602
return true;
573603
}
574604

605+
static void wb_queue_isw(struct bdi_writeback *wb,
606+
struct inode_switch_wbs_context *isw)
607+
{
608+
if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
609+
queue_work(isw_wq, &wb->switch_work);
610+
}
611+
575612
/**
576613
* inode_switch_wbs - change the wb association of an inode
577614
* @inode: target inode
@@ -585,6 +622,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
585622
struct backing_dev_info *bdi = inode_to_bdi(inode);
586623
struct cgroup_subsys_state *memcg_css;
587624
struct inode_switch_wbs_context *isw;
625+
struct bdi_writeback *new_wb = NULL;
588626

589627
/* noop if seems to be already in progress */
590628
if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +647,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
609647
if (!memcg_css)
610648
goto out_free;
611649

612-
isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
650+
new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
613651
css_put(memcg_css);
614-
if (!isw->new_wb)
652+
if (!new_wb)
615653
goto out_free;
616654

617-
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
655+
if (!inode_prepare_wbs_switch(inode, new_wb))
618656
goto out_free;
619657

620658
isw->inodes[0] = inode;
621659

622-
/*
623-
* In addition to synchronizing among switchers, I_WB_SWITCH tells
624-
* the RCU protected stat update paths to grab the i_page
625-
* lock so that stat transfer can synchronize against them.
626-
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
627-
*/
628-
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
629-
queue_rcu_work(isw_wq, &isw->work);
660+
wb_queue_isw(new_wb, isw);
630661
return;
631662

632663
out_free:
633664
atomic_dec(&isw_nr_in_flight);
634-
if (isw->new_wb)
635-
wb_put(isw->new_wb);
665+
if (new_wb)
666+
wb_put(new_wb);
636667
kfree(isw);
637668
}
638669

639-
static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
670+
static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
671+
struct inode_switch_wbs_context *isw,
640672
struct list_head *list, int *nr)
641673
{
642674
struct inode *inode;
643675

644676
list_for_each_entry(inode, list, i_io_list) {
645-
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
677+
if (!inode_prepare_wbs_switch(inode, new_wb))
646678
continue;
647679

648680
isw->inodes[*nr] = inode;
@@ -666,6 +698,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
666698
{
667699
struct cgroup_subsys_state *memcg_css;
668700
struct inode_switch_wbs_context *isw;
701+
struct bdi_writeback *new_wb;
669702
int nr;
670703
bool restart = false;
671704

@@ -678,12 +711,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
678711

679712
for (memcg_css = wb->memcg_css->parent; memcg_css;
680713
memcg_css = memcg_css->parent) {
681-
isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
682-
if (isw->new_wb)
714+
new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
715+
if (new_wb)
683716
break;
684717
}
685-
if (unlikely(!isw->new_wb))
686-
isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
718+
if (unlikely(!new_wb))
719+
new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
687720

688721
nr = 0;
689722
spin_lock(&wb->list_lock);
@@ -695,27 +728,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
695728
* bandwidth restrictions, as writeback of inode metadata is not
696729
* accounted for.
697730
*/
698-
restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
731+
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
699732
if (!restart)
700-
restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
733+
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
734+
&nr);
701735
spin_unlock(&wb->list_lock);
702736

703737
/* no attached inodes? bail out */
704738
if (nr == 0) {
705739
atomic_dec(&isw_nr_in_flight);
706-
wb_put(isw->new_wb);
740+
wb_put(new_wb);
707741
kfree(isw);
708742
return restart;
709743
}
710744

711-
/*
712-
* In addition to synchronizing among switchers, I_WB_SWITCH tells
713-
* the RCU protected stat update paths to grab the i_page
714-
* lock so that stat transfer can synchronize against them.
715-
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
716-
*/
717-
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
718-
queue_rcu_work(isw_wq, &isw->work);
745+
wb_queue_isw(new_wb, isw);
719746

720747
return restart;
721748
}

include/linux/backing-dev-defs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,10 @@ struct bdi_writeback {
152152
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
153153
struct list_head b_attached; /* attached inodes, protected by list_lock */
154154
struct list_head offline_node; /* anchored at offline_cgwbs */
155+
struct work_struct switch_work; /* work used to perform inode switching
156+
* to this wb */
157+
struct llist_head switch_wbs_ctxs; /* queued contexts for
158+
* writeback switching */
155159

156160
union {
157161
struct work_struct release_work;

include/linux/writeback.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
265265
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
266266
}
267267

268+
void inode_switch_wbs_work_fn(struct work_struct *work);
269+
268270
#else /* CONFIG_CGROUP_WRITEBACK */
269271

270272
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)

mm/backing-dev.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
633633
wb_exit(wb);
634634
bdi_put(bdi);
635635
WARN_ON_ONCE(!list_empty(&wb->b_attached));
636+
WARN_ON_ONCE(work_pending(&wb->switch_work));
636637
call_rcu(&wb->rcu, cgwb_free_rcu);
637638
}
638639

@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
709710
wb->memcg_css = memcg_css;
710711
wb->blkcg_css = blkcg_css;
711712
INIT_LIST_HEAD(&wb->b_attached);
713+
INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
714+
init_llist_head(&wb->switch_wbs_ctxs);
712715
INIT_WORK(&wb->release_work, cgwb_release_workfn);
713716
set_bit(WB_registered, &wb->state);
714717
bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
839842
if (!ret) {
840843
bdi->wb.memcg_css = &root_mem_cgroup->css;
841844
bdi->wb.blkcg_css = blkcg_root_css;
845+
INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
846+
init_llist_head(&bdi->wb.switch_wbs_ctxs);
842847
}
843848
return ret;
844849
}

0 commit comments

Comments
 (0)