-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
283a29b
commit 2ed1cf9
Showing
2 changed files
with
261 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,257 @@ | ||
From 69dfc1d574d2faddf770a40cf5d1f9715da33fd2 Mon Sep 17 00:00:00 2001 | ||
From: Shashank Sharma <[email protected]> | ||
Date: Mon, 7 Mar 2022 12:32:49 +0100 | ||
Subject: [PATCH 1/3] drm: Add GPU reset sysfs event | ||
|
||
This patch adds a new sysfs event, which will notify | ||
userland about a GPU reset, and can also provide | ||
some information like: | ||
- process ID of the process involved with the GPU reset | ||
- process name of the involved process | ||
- the GPU status info (using flags) | ||
|
||
This patch also introduces the first flag of the flags | ||
bitmap, which can be extended as and when required. | ||
|
||
V2: Addressed review comments from Christian and Amar | ||
- move the reset information structure to DRM layer | ||
- drop _ctx from struct name | ||
- make pid 32 bit (rather than 64) | ||
- set flag when VRAM is invalid (rather than valid) | ||
- add process name as well (Amar) | ||
|
||
Cc: Alexander Deucher <[email protected]> | ||
Cc: Christian Koenig <[email protected]> | ||
Cc: Amaranath Somalapuram <[email protected]> | ||
Signed-off-by: Shashank Sharma <[email protected]> | ||
(cherry picked from commit 90230bd9d9c7d979038547460c9a2cbbeff8d6b9) | ||
[Forward port to 6.0] | ||
Signed-off-by: Cristian Ciocaltea <[email protected]> | ||
--- | ||
drivers/gpu/drm/drm_sysfs.c | 31 +++++++++++++++++++++++++++++++ | ||
include/drm/drm_sysfs.h | 10 ++++++++++ | ||
2 files changed, 41 insertions(+) | ||
|
||
diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c | ||
index 430e00b16eeca..4cf1b9b40e707 100644 | ||
--- a/drivers/gpu/drm/drm_sysfs.c | ||
+++ b/drivers/gpu/drm/drm_sysfs.c | ||
@@ -434,6 +434,37 @@ void drm_sysfs_connector_hotplug_event(struct drm_connector *connector) | ||
} | ||
EXPORT_SYMBOL(drm_sysfs_connector_hotplug_event); | ||
|
||
+/** | ||
+ * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset | ||
+ * @dev: DRM device | ||
+ * @reset_info: The contextual information about the reset (like PID, flags) | ||
+ * | ||
+ * Send a KOBJ_CHANGE uevent for the DRM device specified by @dev, carrying | ||
+ * RESET=1 plus PID=, NAME= and FLAGS= taken from @reset_info, so that an | ||
+ * interested client can react. Nothing is sent when @reset_info is NULL. | ||
+ */ | ||
+void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info) | ||
+{ | ||
+ char pid_str[15]; /* "PID=" + 10 digits (max u32) + NUL */ | ||
+ char flags_str[17]; /* "FLAGS=" + 10 digits (max u32) + NUL */ | ||
+ char pname_str[TASK_COMM_LEN + 6]; /* "NAME=" + pname contents + NUL */ | ||
+ char reset_str[] = "RESET=1"; | ||
+ char *envp[] = { reset_str, pid_str, pname_str, flags_str, NULL }; | ||
+ | ||
+ if (!reset_info) { | ||
+ DRM_WARN("No reset info, not sending the event\n"); | ||
+ return; | ||
+ } | ||
+ | ||
+ DRM_DEBUG("generating reset event\n"); | ||
+ | ||
+ snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%u", reset_info->pid); | ||
+ snprintf(pname_str, ARRAY_SIZE(pname_str), "NAME=%s", reset_info->pname); | ||
+ snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", reset_info->flags); | ||
+ kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); | ||
+} | ||
+EXPORT_SYMBOL(drm_sysfs_reset_event); | ||
+ | ||
/** | ||
* drm_sysfs_connector_status_event - generate a DRM uevent for connector | ||
* property status change | ||
diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h | ||
index 6273cac44e479..8c37d6a529328 100644 | ||
--- a/include/drm/drm_sysfs.h | ||
+++ b/include/drm/drm_sysfs.h | ||
@@ -1,17 +1,27 @@ | ||
/* SPDX-License-Identifier: GPL-2.0 */ | ||
#ifndef _DRM_SYSFS_H_ | ||
#define _DRM_SYSFS_H_ | ||
+#include <linux/sched.h> | ||
+ | ||
+#define DRM_GPU_RESET_FLAG_VRAM_INVALID (1 << 0) | ||
|
||
struct drm_device; | ||
struct device; | ||
struct drm_connector; | ||
struct drm_property; | ||
|
||
+struct drm_reset_event { | ||
+ uint32_t pid; /* emitted as PID= in the reset uevent */ | ||
+ uint32_t flags; /* emitted as FLAGS=; presumably DRM_GPU_RESET_FLAG_* bits */ | ||
+ char pname[TASK_COMM_LEN]; /* emitted as NAME= in the reset uevent */ | ||
+}; | ||
+ | ||
int drm_class_device_register(struct device *dev); | ||
void drm_class_device_unregister(struct device *dev); | ||
|
||
void drm_sysfs_hotplug_event(struct drm_device *dev); | ||
void drm_sysfs_connector_hotplug_event(struct drm_connector *connector); | ||
+void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info); | ||
void drm_sysfs_connector_status_event(struct drm_connector *connector, | ||
struct drm_property *property); | ||
#endif | ||
-- | ||
2.41.0 | ||
|
||
|
||
From b717d62ae484fe8fc2f87478960e95cad838f574 Mon Sep 17 00:00:00 2001 | ||
From: Shashank Sharma <[email protected]> | ||
Date: Mon, 7 Mar 2022 15:33:00 +0100 | ||
Subject: [PATCH 2/3] drm/amdgpu: add work function for GPU reset event | ||
|
||
This patch adds a work function, which sends a GPU reset | ||
uevent and some contextual information, like the PID and | ||
some status flags. This work should be scheduled during | ||
a GPU reset. | ||
|
||
The userspace can do some recovery and post-processing work | ||
based on this event and information. | ||
|
||
V2: Addressed review comments from Christian | ||
- Changed the name of the work to gpu_reset_event_work | ||
- Added a structure to accommodate some additional information | ||
(like a PID and some flags) | ||
- Do not add new structure in amdgpu.h | ||
|
||
Cc: Alexander Deucher <[email protected]> | ||
Cc: Christian Koenig <[email protected]> | ||
Cc: Amaranath Somalapuram <[email protected]> | ||
Signed-off-by: Shashank Sharma <[email protected]> | ||
(cherry picked from commit f63b09e78126f7da67b69409e2cce1d3ab2d7f46) | ||
[Forward port to 6.0] | ||
Signed-off-by: Cristian Ciocaltea <[email protected]> | ||
--- | ||
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++ | ||
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++++ | ||
2 files changed, 16 insertions(+) | ||
|
||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h | ||
index 2eca58220550e..5c376f7f51f4f 100644 | ||
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h | ||
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h | ||
@@ -60,6 +60,7 @@ | ||
#include <drm/amdgpu_drm.h> | ||
#include <drm/drm_gem.h> | ||
#include <drm/drm_ioctl.h> | ||
+#include <drm/drm_sysfs.h> | ||
|
||
#include <kgd_kfd_interface.h> | ||
#include "dm_pp_interface.h" | ||
@@ -1002,6 +1003,7 @@ struct amdgpu_device { | ||
|
||
int asic_reset_res; | ||
struct work_struct xgmi_reset_work; | ||
+ struct work_struct gpu_reset_event_work; | ||
struct list_head reset_list; | ||
|
||
long gfx_timeout; | ||
@@ -1035,6 +1037,7 @@ struct amdgpu_device { | ||
pci_channel_state_t pci_channel_state; | ||
|
||
struct amdgpu_reset_control *reset_cntl; | ||
+ struct drm_reset_event reset_event_info; | ||
uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; | ||
|
||
bool ram_is_direct_mapped; | ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | ||
index f1e9663b40510..08aa72743dfce 100644 | ||
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | ||
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | ||
@@ -76,6 +76,7 @@ | ||
#include <linux/pm_runtime.h> | ||
|
||
#include <drm/drm_drv.h> | ||
+#include <drm/drm_sysfs.h> | ||
|
||
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); | ||
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); | ||
@@ -3355,6 +3356,17 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) | ||
return amdgpu_device_asic_has_dc_support(adev->asic_type); | ||
} | ||
|
||
+static void amdgpu_device_reset_event_func(struct work_struct *__work) | ||
+{ | ||
+ struct amdgpu_device *adev; | ||
+ | ||
+ /* A GPU reset has happened: find the device embedding this work item | ||
+ * and hand its recorded reset details to the DRM reset uevent helper. | ||
+ */ | ||
+ adev = container_of(__work, struct amdgpu_device, gpu_reset_event_work); | ||
+ drm_sysfs_reset_event(&adev->ddev, &adev->reset_event_info); | ||
+} | ||
+ | ||
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) | ||
{ | ||
struct amdgpu_device *adev = | ||
@@ -3606,6 +3618,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | ||
amdgpu_device_delay_enable_gfx_off); | ||
|
||
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); | ||
+ INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func); | ||
|
||
adev->gfx.gfx_off_req_count = 1; | ||
adev->gfx.gfx_off_residency = 0; | ||
-- | ||
2.41.0 | ||
|
||
|
||
From b65228799e5ac96211b0e4c342d0a4478a1b6410 Mon Sep 17 00:00:00 2001 | ||
From: Somalapuram Amaranath <[email protected]> | ||
Date: Thu, 10 Mar 2022 11:31:44 +0530 | ||
Subject: [PATCH 3/3] drm/amdgpu: schedule GPU reset event work function | ||
|
||
Schedule work function with valid PID, process name, | ||
and vram lost status during a GPU reset/recovery. | ||
|
||
Signed-off-by: Somalapuram Amaranath <[email protected]> | ||
(cherry picked from commit 293c019a84c6402b08db9579819b555b01cd613b) | ||
[Forward ported to 6.0] | ||
Signed-off-by: Cristian Ciocaltea <[email protected]> | ||
--- | ||
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++ | ||
1 file changed, 14 insertions(+) | ||
|
||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | ||
index 08aa72743dfce..47ff7a56c6848 100644 | ||
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | ||
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | ||
@@ -4924,6 +4924,20 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, | ||
reset_context->job->vm->task_info; | ||
amdgpu_reset_capture_coredumpm(tmp_adev); | ||
#endif | ||
+ if (reset_context->job && reset_context->job->vm) { | ||
+ tmp_adev->reset_event_info.pid = | ||
+ reset_context->job->vm->task_info.pid; | ||
+ memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN); | ||
+ strcpy(tmp_adev->reset_event_info.pname, | ||
+ reset_context->job->vm->task_info.process_name); | ||
+ } else { | ||
+ tmp_adev->reset_event_info.pid = 0; | ||
+ memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN); | ||
+ } | ||
+ | ||
+ tmp_adev->reset_event_info.flags = vram_lost; | ||
+ schedule_work(&tmp_adev->gpu_reset_event_work); | ||
+ | ||
if (vram_lost) { | ||
DRM_INFO("VRAM is lost due to GPU reset!\n"); | ||
amdgpu_inc_vram_lost(tmp_adev); | ||
-- | ||
2.41.0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters