mm/oom_kill: squashed reverts to a stable state

Revert "mm, oom: fix use-after-free in oom_kill_process" This reverts commit e1bebdeedb497f03d426c85a89c3807c7e75268d. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "mm,oom: make oom_killer_disable() killable" This reverts commit 65a7400a432639aa8d5e572f30687fbca204b6f8. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "mm: oom_kill: don't ignore oom score on exiting tasks" This reverts commit d60dae46b27a8f381e4a7ad9dde870faa49fa5f1. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "mm/oom_kill.c: avoid attempting to kill init sharing same memory" This reverts commit 10773c0325259d6640b93c0694b5598ddf84939f. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "CHROMIUM: DROP: mm/oom_kill: Double-check before killing a child in our place" This reverts commit 2bdd9a2042a0e12d96c545773d9d8038c920f813. Revert "mm/oom_kill: fix the wrong task->mm == mm checks in oom_kill_process()" This reverts commit 419a313435b31821e4d045ca4b7ea1cc5fa02035. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "mm/oom_kill: cleanup the "kill sharing same memory" loop" This reverts commit afda78c6de38f9f66eba0955153b380d540d8276. Revert "mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process()" This reverts commit acde9c2ace298b249c06ec5b0b971c333449dc09. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "mm, oom: remove task_lock protecting comm printing" This reverts commit 9a9ca142d250ec9de1215284857f4528c6ddb080. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "mm/oom_kill.c: suppress unnecessary "sharing same memory" message" This reverts commit 1aa2960f7c70d65b1481f805ac73b988faff6747. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "mm/oom_kill.c: reverse the order of setting TIF_MEMDIE and sending SIGKILL" This reverts commit f028aedfcfd2e2bb98921b98d3ae183387ab8fed. Revert "mm, oom: remove unnecessary variable" This reverts commit 54b0b58224146d68a11bccb5e64683ab3029373a. Revert "mm/oom_kill.c: print points as unsigned int" This reverts commit 603f975a6d4f0b56c7f6df7889ef2a704eca94a3. Signed-off-by: Corinna Vinschen <xda@vinschen.de> Revert "mm: oom_kill: simplify OOM killer locking" This reverts commit 7951a52ed35d162063fa08b27894e302fd716ccd. Revert "mm: oom_kill: remove unnecessary locking in exit_oom_victim()" This reverts commit f0739b25ac884682865d6aae7485e79489107bfb. Revert "mm: oom_kill: generalize OOM progress waitqueue" This reverts commit eb4b1243c72ba0b392bbe05dbf9f91959f70eb18. Revert "mm: oom_kill: switch test-and-clear of known TIF_MEMDIE to clear" This reverts commit e611f16275c3642cb8a6345ff2470926fef52110. Revert "mm: oom_kill: clean up victim marking and exiting interfaces" This reverts commit c6fada01b9370e3d7603b4ad8c26b56759174667. Revert "mm: oom_kill: remove unnecessary locking in oom_enable()" This reverts commit 5dd152d7351b3805f59b2b1f624722ab2f3c5fd8. Revert "oom, PM: make OOM detection in the freezer path raceless" This reverts commit 5fc5b1ddee5404a7629dd7045f54eaf8941bc11c.
author: Corinna Vinschen <xda@vinschen.de> 2019-04-16 10:32:37 +0200
committer: Moyster <oysterized@gmail.com> 2019-07-19 00:08:32 +0200
commit: 33c8b08c2bbe034fc01b346b1e6ce94913efb8ef (patch)
tree: 54013c4e0059e7fbe47f563ca60097044f6ad52b
parent: 23a81af2839a25b6a48484b261661a17132b72d2 (diff)
10 files changed, 215 insertions, 202 deletions
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index 8251171ca..eef102355 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -218,7 +218,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 		 * infrastructure. There is no real reason why the selected
 		 * task should have access to the memory reserves.
 		 */
-		mark_oom_victim(selected);
+		mark_tsk_oom_victim(selected);
 		send_sig(SIGKILL, selected, 0);
 		rem -= selected_tasksize;
 	}
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 70247e699..240384965 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,11 +353,8 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
-	mutex_lock(&oom_lock);
-	if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
-			   GFP_KERNEL, 0, NULL, true))
-		pr_info("OOM request ignored because killer is disabled\n");
-	mutex_unlock(&oom_lock);
+	out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL,
+		      0, NULL, true);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 660ccb535..7bc96dce8 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -86,7 +86,7 @@ extern int current_cpuset_is_being_rebound(void);
 
 extern void rebuild_sched_domains(void);
 
-extern void cpuset_print_current_mems_allowed(void);
+extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
 /*
  * get_mems_allowed is required when making decisions involving mems_allowed
@@ -219,7 +219,7 @@ static inline void rebuild_sched_domains(void)
 	partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_print_current_mems_allowed(void)
+static inline void cpuset_print_task_mems_allowed(struct task_struct *p)
 {
 }
 
diff --git a/include/linux/oom.h b/include/linux/oom.h
index acabe09da..e31b58f75 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -32,8 +32,6 @@ enum oom_scan_t {
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
-extern struct mutex oom_lock;
-
 static inline void set_current_oom_origin(void)
 {
 	current->signal->oom_flags |= OOM_FLAG_ORIGIN;
@@ -49,7 +47,9 @@ static inline bool oom_task_origin(const struct task_struct *p)
 	return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
 }
 
-extern void mark_oom_victim(struct task_struct *tsk);
+extern void mark_tsk_oom_victim(struct task_struct *tsk);
+
+extern void unmark_oom_victim(void);
 
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -62,6 +62,9 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
 			     const char *message);
 
+extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
+extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
+
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			       int order, const nodemask_t *nodemask);
 
@@ -69,17 +72,22 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		unsigned long totalpages, const nodemask_t *nodemask,
 		bool force_kill);
 
-extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *mask, bool force_kill);
-
-extern void exit_oom_victim(void);
-
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
 extern bool oom_killer_disabled;
-extern bool oom_killer_disable(void);
-extern void oom_killer_enable(void);
+
+static inline void oom_killer_disable(void)
+{
+	oom_killer_disabled = true;
+}
+
+static inline void oom_killer_enable(void)
+{
+	oom_killer_disabled = false;
+}
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f3a417c1d..ab0be4565 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2601,26 +2601,28 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 #define CPUSET_NODELIST_LEN	(256)
 
 /**
- * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
+ * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
+ * @task: pointer to task_struct of some task.
  *
- * Description: Prints current's name, cpuset name, and cached copy of its
- * mems_allowed to the kernel log.
+ * Description: Prints @task's name, cpuset name, and cached copy of its
+ * mems_allowed to the kernel log.  Must hold task_lock(task) to allow
+ * dereferencing task_cs(task).
  */
-void cpuset_print_current_mems_allowed(void)
+void cpuset_print_task_mems_allowed(struct task_struct *tsk)
 {
 	 /* Statically allocated to prevent using excess stack. */
 	static char cpuset_nodelist[CPUSET_NODELIST_LEN];
 	static DEFINE_SPINLOCK(cpuset_buffer_lock);
 
-	struct cgroup *cgrp = task_cs(current)->css.cgroup;
+	struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
 
 	rcu_read_lock();
 	spin_lock(&cpuset_buffer_lock);
 
 	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
-			   current->mems_allowed);
-	pr_info("%s cpuset=%s mems_allowed=%s\n",
-	       current->comm, cgroup_name(cgrp), cpuset_nodelist);
+			   tsk->mems_allowed);
+	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
+	       tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
 
 	spin_unlock(&cpuset_buffer_lock);
 	rcu_read_unlock();
diff --git a/kernel/exit.c b/kernel/exit.c
index d9153652a..dffb92e9b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -458,10 +458,8 @@ static void exit_mm(struct task_struct *tsk)
 	enter_lazy_tlb(mm, current);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
-
 	mmput(mm);
-	if (test_thread_flag(TIF_MEMDIE))
-		exit_oom_victim();
+	unmark_oom_victim();
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 899e07479..3bd3a1185 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -118,6 +118,30 @@ static int try_to_freeze_tasks(bool user_only)
 	return todo ? -EBUSY : 0;
 }
 
+static bool __check_frozen_processes(void)
+{
+	struct task_struct *g, *p;
+
+	for_each_process_thread(g, p)
+		if (p != current && !freezer_should_skip(p) && !frozen(p))
+			return false;
+
+	return true;
+}
+
+/*
+ * Returns true if all freezable tasks (except for current) are frozen already
+ */
+static bool check_frozen_processes(void)
+{
+	bool ret;
+
+	read_lock(&tasklist_lock);
+	ret = __check_frozen_processes();
+	read_unlock(&tasklist_lock);
+	return ret;
+}
+
 /**
  * freeze_processes - Signal user space processes to enter the refrigerator.
  * The current thread will not be frozen.  The same process that calls
@@ -128,6 +152,7 @@ static int try_to_freeze_tasks(bool user_only)
 int freeze_processes(void)
 {
 	int error;
+	int oom_kills_saved;
 
 	error = __usermodehelper_disable(UMH_FREEZING);
 	if (error)
@@ -142,22 +167,29 @@ int freeze_processes(void)
 	pm_wakeup_clear();
 	pr_info("Freezing user space processes ... ");
 	pm_freezing = true;
+	oom_kills_saved = oom_kills_count();
 	error = try_to_freeze_tasks(true);
 	if (!error) {
 		__usermodehelper_set_disable_depth(UMH_DISABLED);
-		pr_cont("done.");
+		oom_killer_disable();
+
+		/*
+		 * There might have been an OOM kill while we were
+		 * freezing tasks and the killed task might be still
+		 * on the way out so we have to double check for race.
+		 */
+		if (oom_kills_count() != oom_kills_saved &&
+		    !check_frozen_processes()) {
+			__usermodehelper_set_disable_depth(UMH_ENABLED);
+			pr_cont("OOM in progress.");
+			error = -EBUSY;
+		} else {
+			pr_cont("done.");
+		}
 	}
 	pr_cont("\n");
 	BUG_ON(in_atomic());
 
-	/*
-	 * Now that the whole userspace is frozen we need to disbale
-	 * the OOM killer to disallow any further interference with
-	 * killable tasks.
-	 */
-	if (!error && !oom_killer_disable())
-		error = -EBUSY;
-
 	if (error)
 		thaw_processes();
 	return error;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b843aeeb3..f58bcb213 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1807,16 +1807,14 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned int points = 0;
 	struct task_struct *chosen = NULL;
 
-	mutex_lock(&oom_lock);
-
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
 	 * select it.  The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
 	 */
 	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-		mark_oom_victim(current);
-		goto unlock;
+		mark_tsk_oom_victim(current);
+		return;
 	}
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
@@ -1844,7 +1842,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				mem_cgroup_iter_break(memcg, iter);
 				if (chosen)
 					put_task_struct(chosen);
-				goto unlock;
+				return;
 			case OOM_SCAN_OK:
 				break;
 			};
@@ -1865,13 +1863,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		cgroup_iter_end(cgroup, &it);
 	}
 
-	if (chosen) {
-		points = chosen_points * 1000 / totalpages;
-		oom_kill_process(chosen, gfp_mask, order, points, totalpages,
-				 memcg, NULL, "Memory cgroup out of memory");
-	}
-unlock:
-	mutex_unlock(&oom_lock);
+	if (!chosen)
+		return;
+	points = chosen_points * 1000 / totalpages;
+	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
+			 NULL, "Memory cgroup out of memory");
 }
 
 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
@@ -2266,7 +2262,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (!memcg)
 		return false;
 
-	if (!handle || oom_killer_disabled)
+	if (!handle)
 		goto cleanup;
 
 	owait.memcg = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ffb71de7c..3b7e5c947 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,8 +42,7 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
-
-DEFINE_MUTEX(oom_lock);
+static DEFINE_SPINLOCK(zone_scan_lock);
 
 static unsigned long last_victim;
 
@@ -300,6 +299,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 	if (oom_task_origin(task))
 		return OOM_SCAN_SELECT;
 
+	if (task_will_free_mem(task) && !force_kill)
+		return OOM_SCAN_ABORT;
+
 	return OOM_SCAN_OK;
 }
 
@@ -399,11 +401,13 @@ void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
 static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 			struct mem_cgroup *memcg, const nodemask_t *nodemask)
 {
+	task_lock(current);
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
 		"oom_score_adj=%hd\n",
 		current->comm, gfp_mask, order,
 		current->signal->oom_score_adj);
-	cpuset_print_current_mems_allowed();
+	cpuset_print_task_mems_allowed(current);
+	task_unlock(current);
 	dump_stack();
 	if (memcg)
 		mem_cgroup_print_oom_info(memcg, p);
@@ -414,26 +418,30 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 }
 
 /*
- * Number of OOM victims in flight
+ * Number of OOM killer invocations (including memcg OOM killer).
+ * Primarily used by PM freezer to check for potential races with
+ * OOM killed frozen task.
  */
-static atomic_t oom_victims = ATOMIC_INIT(0);
-static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
+static atomic_t oom_kills = ATOMIC_INIT(0);
 
-bool oom_killer_disabled __read_mostly;
+int oom_kills_count(void)
+{
+	return atomic_read(&oom_kills);
+}
+
+void note_oom_kill(void)
+{
+	atomic_inc(&oom_kills);
+}
 
 /**
- * mark_oom_victim - mark the given task as OOM victim
+ * mark_tsk_oom_victim - marks the given taks as OOM victim.
  * @tsk: task to mark
- *
- * Has to be called with oom_lock held and never after
- * oom has been disabled already.
  */
-void mark_oom_victim(struct task_struct *tsk)
+void mark_tsk_oom_victim(struct task_struct *tsk)
 {
-	WARN_ON(oom_killer_disabled);
-	/* OOM killer might race with memcg OOM */
-	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
-		return;
+	set_tsk_thread_flag(tsk, TIF_MEMDIE);
+
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
 	 * if it is frozen because OOM killer wouldn't be able to free
@@ -441,73 +449,14 @@ void mark_oom_victim(struct task_struct *tsk)
 	 * that TIF_MEMDIE tasks should be ignored.
 	 */
 	__thaw_task(tsk);
-	atomic_inc(&oom_victims);
 }
 
 /**
- * exit_oom_victim - note the exit of an OOM victim
+ * unmark_oom_victim - unmarks the current task as OOM victim.
  */
-void exit_oom_victim(void)
+void unmark_oom_victim(void)
 {
 	clear_thread_flag(TIF_MEMDIE);
-
-	if (!atomic_dec_return(&oom_victims))
-		wake_up_all(&oom_victims_wait);
-}
-
-/**
- * oom_killer_disable - disable OOM killer
- *
- * Forces all page allocations to fail rather than trigger OOM killer.
- * Will block and wait until all OOM victims are killed.
- *
- * The function cannot be called when there are runnable user tasks because
- * the userspace would see unexpected allocation failures as a result. Any
- * new usage of this function should be consulted with MM people.
- *
- * Returns true if successful and false if the OOM killer cannot be
- * disabled.
- */
-bool oom_killer_disable(void)
-{
-	/*
-	 * Make sure to not race with an ongoing OOM killer. Check that the
-	 * current is not killed (possibly due to sharing the victim's memory).
-	 */
-	if (mutex_lock_killable(&oom_lock))
-		return false;
-	oom_killer_disabled = true;
-	mutex_unlock(&oom_lock);
-
-	wait_event(oom_victims_wait, !atomic_read(&oom_victims));
-
-	return true;
-}
-
-/**
- * oom_killer_enable - enable OOM killer
- */
-void oom_killer_enable(void)
-{
-	oom_killer_disabled = false;
-}
-
-/*
- * task->mm can be NULL if the task is the exited group leader.  So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
-	struct task_struct *t;
-
-	for_each_thread(p, t) {
-		struct mm_struct *t_mm = READ_ONCE(t->mm);
-		if (t_mm)
-			return t_mm == mm;
-	}
-	return false;
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -534,7 +483,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 */
 	task_lock(p);
 	if (p->mm && task_will_free_mem(p)) {
-		mark_oom_victim(p);
+		mark_tsk_oom_victim(p);
 		task_unlock(p);
 		last_victim = jiffies;
 		put_task_struct(p);
@@ -545,8 +494,10 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	if (__ratelimit(&oom_rs))
 		dump_header(p, gfp_mask, order, memcg, nodemask);
 
-	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+	task_lock(p);
+	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
 		message, task_pid_nr(p), p->comm, points);
+	task_unlock(p);
 
 	/*
 	 * If any of p's children has a different mm and is eligible for kill,
@@ -555,17 +506,9 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 * still freeing memory.
 	 */
 	read_lock(&tasklist_lock);
-
-	/*
-	 * The task 'p' might have already exited before reaching here. The
-	 * put_task_struct() will free task_struct 'p' while the loop still try
-	 * to access the field of 'p', so, get an extra reference.
-	 */
-	get_task_struct(p);
 	for_each_thread(p, t) {
 		list_for_each_entry(child, &t->children, sibling) {
 			unsigned int child_points;
-			enum oom_scan_t scan_result;
 
             /*LCH add for race condition*/
             if (task_will_free_mem(p)) {
@@ -578,16 +521,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 		        return;
             }
 
-			if (process_shares_mm(child, p->mm))
-				continue;
-
-			/* Make sure no objections to killing the child */
-			scan_result = oom_scan_process_thread(child, totalpages,
-				nodemask, false);
-			if (scan_result == OOM_SCAN_CONTINUE ||
-			    scan_result == OOM_SCAN_ABORT)
+			if (child->mm == p->mm)
 				continue;
-
 			/*
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
@@ -601,7 +536,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			}
 		}
 	}
-	put_task_struct(p);
 	read_unlock(&tasklist_lock);
 
 	p = find_lock_task_mm(victim);
@@ -616,13 +550,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 	/* mm cannot safely be dereferenced after task_unlock(victim) */
 	mm = victim->mm;
-	/*
-	 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
-	 * the OOM victim from depleting the memory reserves from the user
-	 * space under its control.
-	 */
-	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
-	mark_oom_victim(victim);
+	mark_tsk_oom_victim(victim);
 	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
 		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
 		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -639,22 +567,22 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 * pending fatal signal.
 	 */
 	rcu_read_lock();
-	for_each_process(p) {
-		if (!process_shares_mm(p, mm))
-			continue;
-		if (same_thread_group(p, victim))
-			continue;
-		if (unlikely(p->flags & PF_KTHREAD))
-			continue;
-		if (is_global_init(p))
-			continue;
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-			continue;
-		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
-	}
+	for_each_process(p)
+		if (p->mm == mm && !same_thread_group(p, victim) &&
+		    !(p->flags & PF_KTHREAD)) {
+			if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+				continue;
+
+			task_lock(p);	/* Protect ->comm from prctl() */
+			pr_err("Kill process %d (%s) sharing same memory\n",
+				task_pid_nr(p), p->comm);
+			task_unlock(p);
+			do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+		}
 	rcu_read_unlock();
 
 	last_victim = jiffies;
+	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
 	put_task_struct(victim);
 }
 #undef K
@@ -695,8 +623,54 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+/*
+ * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
+ * if a parallel OOM killing is already taking place that includes a zone in
+ * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
+ */
+bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+	struct zoneref *z;
+	struct zone *zone;
+	bool ret = true;
+
+	spin_lock(&zone_scan_lock);
+	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
+		if (zone_is_oom_locked(zone)) {
+			ret = false;
+			goto out;
+		}
+
+	/*
+	 * Lock each zone in the zonelist under zone_scan_lock so a parallel
+	 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
+	 */
+	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
+		zone_set_flag(zone, ZONE_OOM_LOCKED);
+
+out:
+	spin_unlock(&zone_scan_lock);
+	return ret;
+}
+
+/*
+ * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
+ * allocation attempts with zonelists containing them may now recall the OOM
+ * killer, if necessary.
+ */
+void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	spin_lock(&zone_scan_lock);
+	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
+		zone_clear_flag(zone, ZONE_OOM_LOCKED);
+	spin_unlock(&zone_scan_lock);
+}
+
 /**
- * __out_of_memory - kill the "best" process when we run out of memory
+ * out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
@@ -708,8 +682,8 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		   int order, nodemask_t *nodemask, bool force_kill)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask, bool force_kill)
 {
 	const nodemask_t *mpol_mask;
 	struct task_struct *p;
@@ -717,9 +691,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	unsigned long freed = 0;
 	unsigned int uninitialized_var(points);
 	enum oom_constraint constraint = CONSTRAINT_NONE;
-
-	if (oom_killer_disabled)
-		return false;
+	int killed = 0;
 
 #ifdef CONFIG_MT_ENG_BUILD
 	//void add_kmem_status_oom_counter(void);
@@ -729,7 +701,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
 		/* Got some memory back in the last second. */
-		return true;
+		return;
 
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
@@ -741,9 +713,9 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 */
 	if (current->mm &&
 	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
-		mark_oom_victim(current);
+		mark_tsk_oom_victim(current);
 		last_victim = jiffies;
-		return true;
+		return;
 	}
 
 	/*
@@ -762,7 +734,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
 				 nodemask,
 				 "Out of memory (oom_kill_allocating_task)");
-		return true;
+		goto out;
 	}
 
 	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
@@ -774,13 +746,15 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	if (PTR_ERR(p) != -1UL) {
 		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
 				 nodemask, "Out of memory");
-		/*
-		 * Give the killed process a good chance to exit before trying
-		 * to allocate memory again.
-		 */
-		schedule_timeout_killable(1);
+		killed = 1;
 	}
-	return true;
+out:
+	/*
+	 * Give the killed threads a good chance of exiting before trying to
+	 * allocate memory again.
+	 */
+	if (killed)
+		schedule_timeout_killable(1);
 }
 
 /*
@@ -790,21 +764,14 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
  */
 void pagefault_out_of_memory(void)
 {
-	if (mem_cgroup_oom_synchronize(true))
-		return;
+	struct zonelist *zonelist;
 
-	if (!mutex_trylock(&oom_lock))
+	if (mem_cgroup_oom_synchronize(true))
 		return;
 
-	if (!out_of_memory(NULL, 0, 0, NULL, false)) {
-		/*
-		 * There shouldn't be any user tasks runnable while the
-		 * OOM killer is disabled, so the current task has to
-		 * be a racing OOM victim for which oom_killer_disable()
-		 * is waiting for.
-		 */
-		WARN_ON(test_thread_flag(TIF_MEMDIE));
+	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
+	if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
+		out_of_memory(NULL, 0, 0, NULL, false);
+		oom_zonelist_unlock(zonelist, GFP_KERNEL);
 	}
-
-	mutex_unlock(&oom_lock);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4025d9979..82a296ec2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -281,6 +281,8 @@ void __meminit set_pageblock_mobility(struct page *page, int mobility)
 }
 #endif
 
+bool oom_killer_disabled __read_mostly;
+
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -2529,17 +2531,28 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 
 	*did_some_progress = 0;
 
+	if (oom_killer_disabled)
+		return NULL;
+
 	/*
-	 * Acquire the oom lock.  If that fails, somebody else is
-	 * making progress for us.
+	 * Acquire the per-zone oom lock for each zone.  If that
+	 * fails, somebody else is making progress for us.
 	 */
-	if (!mutex_trylock(&oom_lock)) {
+	if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
 		*did_some_progress = 1;
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
 
 	/*
+	 * PM-freezer should be notified that there might be an OOM killer on
+	 * its way to kill and wake somebody up. This is too early and we might
+	 * end up not killing anything but false positives are acceptable.
+	 * See freeze_processes.
+	 */
+	note_oom_kill();
+
+	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
 	 * here, this is only to catch a parallel oom killing, we must fail if
 	 * we're still under heavy pressure.
@@ -2575,10 +2588,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	if (out_of_memory(zonelist, gfp_mask, order, nodemask, false))
-		*did_some_progress = 1;
+	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
+	*did_some_progress = 1;
 out:
-	mutex_unlock(&oom_lock);
+	oom_zonelist_unlock(zonelist, gfp_mask);
 	return page;
 }
author	Corinna Vinschen <xda@vinschen.de>	2019-04-16 10:32:37 +0200
committer	Moyster <oysterized@gmail.com>	2019-07-19 00:08:32 +0200
commit	33c8b08c2bbe034fc01b346b1e6ce94913efb8ef (patch)
tree	54013c4e0059e7fbe47f563ca60097044f6ad52b
parent	23a81af2839a25b6a48484b261661a17132b72d2 (diff)