参照元†
- pg_data_t *pgdat
- struct scan_control *sc
返り値†
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
/*
* Flush the memory cgroup stats, so that we read accurate per-memcg
* lruvec stats for heuristics.
*/
mem_cgroup_flush_stats();
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
/*
* Determine the scan balance between anon and file LRUs.
*/
spin_lock_irq(&target_lruvec->lru_lock);
sc->anon_cost = target_lruvec->anon_cost;
sc->file_cost = target_lruvec->file_cost;
spin_unlock_irq(&target_lruvec->lru_lock);
/*
* Target desirable inactive:active list ratios for the anon
* and file LRU lists.
*/
if (!sc->force_deactivate) {
unsigned long refaults;
refaults = lruvec_page_state(target_lruvec,
WORKINGSET_ACTIVATE_ANON);
if (refaults != target_lruvec->refaults[0] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
sc->may_deactivate |= DEACTIVATE_ANON;
else
sc->may_deactivate &= ~DEACTIVATE_ANON;
/*
* When refaults are being observed, it means a new
* workingset is being established. Deactivate to get
* rid of any stale active pages quickly.
*/
refaults = lruvec_page_state(target_lruvec,
WORKINGSET_ACTIVATE_FILE);
if (refaults != target_lruvec->refaults[1] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
sc->may_deactivate |= DEACTIVATE_FILE;
else
sc->may_deactivate &= ~DEACTIVATE_FILE;
} else
sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
/*
* If we have plenty of inactive file pages that aren't
* thrashing, try to reclaim those first before touching
* anonymous pages.
*/
file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
sc->cache_trim_mode = 1;
else
sc->cache_trim_mode = 0;
/*
* Prevent the reclaimer from falling into the cache trap: as
* cache pages start out inactive, every cache fault will tip
* the scan balance towards the file LRU. And as the file LRU
* shrinks, so does the window for rotation from references.
* This means we have a runaway feedback loop where a tiny
* thrashing file LRU becomes infinitely more attractive than
* anon pages. Try to detect this based on file LRU size.
*/
if (!cgroup_reclaim(sc)) {
unsigned long total_high_wmark = 0;
unsigned long free, anon;
int z;
free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
file = node_page_state(pgdat, NR_ACTIVE_FILE) +
node_page_state(pgdat, NR_INACTIVE_FILE);
for (z = 0; z < MAX_NR_ZONES; z++) {
struct zone *zone = &pgdat->node_zones[z];
if (!managed_zone(zone))
continue;
total_high_wmark += high_wmark_pages(zone);
}
/*
* Consider anon: if that's low too, this isn't a
* runaway file reclaim problem, but rather just
* extreme pressure. Reclaim as per usual then.
*/
anon = node_page_state(pgdat, NR_INACTIVE_ANON);
sc->file_is_tiny =
file + free <= total_high_wmark &&
!(sc->may_deactivate & DEACTIVATE_ANON) &&
anon >> sc->priority;
}
shrink_node_memcgs(pgdat, sc);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
/* Record the subtree's reclaim efficiency */
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
if (current_is_kswapd()) {
/*
* If reclaim is isolating dirty pages under writeback,
* it implies that the long-lived page allocation rate
* is exceeding the page laundering rate. Either the
* global limits are not being effective at throttling
* processes due to the page distribution throughout
* zones or there is heavy usage of a slow backing
* device. The only option is to throttle from reclaim
* context which is not ideal as there is no guarantee
* the dirtying process is throttled in the same way
* balance_dirty_pages() manages.
*
* Once a node is flagged PGDAT_WRITEBACK, kswapd will
* count the number of pages under pages flagged for
* immediate reclaim and stall if any are encountered
* in the nr_immediate check below.
*/
if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/* Allow kswapd to start writing pages during reclaim.*/
if (sc->nr.unqueued_dirty == sc->nr.file_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
* faster than they are written so also forcibly stall.
*/
if (sc->nr.immediate)
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
/*
* Tag a node/memcg as congested if all the dirty pages
* scanned were backed by a congested BDI and
* wait_iff_congested will stall.
*
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling in wait_iff_congested().
*/
if ((current_is_kswapd() ||
(cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
/*
* Stall direct reclaim for IO completions if underlying BDIs
* and node is congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
goto again;
/*
* Kswapd gives up on balancing particular nodes after too
* many failures to reclaim anything from them and goes to
* sleep. On reclaim progress, reset the failure counter. A
* successful direct reclaim run will revive a dormant kswapd.
*/
if (reclaimable)
pgdat->kswapd_failures = 0;
}
コメント†