Skip to content

Commit

Permalink
Add gpu-reset patches
Browse files Browse the repository at this point in the history
  • Loading branch information
BoukeHaarsma23 committed Aug 21, 2023
1 parent 283a29b commit 2ed1cf9
Show file tree
Hide file tree
Showing 2 changed files with 261 additions and 0 deletions.
257 changes: 257 additions & 0 deletions linux/0001-GPU-reset.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
From 69dfc1d574d2faddf770a40cf5d1f9715da33fd2 Mon Sep 17 00:00:00 2001
From: Shashank Sharma <[email protected]>
Date: Mon, 7 Mar 2022 12:32:49 +0100
Subject: [PATCH 1/3] drm: Add GPU reset sysfs event

This patch adds a new sysfs event, which will notify
userland about a GPU reset, and can also provide
some information like:
- process ID of the process involved with the GPU reset
- process name of the involved process
- the GPU status info (using flags)

This patch also introduces the first flag of the flags
bitmap, which can be appended as and when required.

V2: Addressed review comments from Christian and Amar
- move the reset information structure to DRM layer
- drop _ctx from struct name
- make pid 32 bit (rather than 64)
- set flag when VRAM invalid (rather than valid)
- add process name as well (Amar)

Cc: Alexander Deucher <[email protected]>
Cc: Christian Koenig <[email protected]>
Cc: Amaranath Somalapuram <[email protected]>
Signed-off-by: Shashank Sharma <[email protected]>
(cherry picked from commit 90230bd9d9c7d979038547460c9a2cbbeff8d6b9)
[Forward port to 6.0]
Signed-off-by: Cristian Ciocaltea <[email protected]>
---
drivers/gpu/drm/drm_sysfs.c | 31 +++++++++++++++++++++++++++++++
include/drm/drm_sysfs.h | 10 ++++++++++
2 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 430e00b16eeca..4cf1b9b40e707 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -434,6 +434,37 @@ void drm_sysfs_connector_hotplug_event(struct drm_connector *connector)
}
EXPORT_SYMBOL(drm_sysfs_connector_hotplug_event);

+/**
+ * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
+ * @dev: DRM device
+ * @reset_info: The contextual information about the reset (like PID, flags)
+ *
+ * Send a KOBJ_CHANGE uevent for the DRM device specified by @dev, carrying
+ * RESET=1, PID=, NAME= and FLAGS= variables taken from @reset_info, so that
+ * an interested client can take any recovery or profiling measure.
+ */
+void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info)
+{
+ char pid_str[sizeof("PID=") + 10]; /* a u32 prints as at most 10 digits */
+ char flags_str[sizeof("FLAGS=") + 10];
+ char pname_str[sizeof("NAME=") + TASK_COMM_LEN];
+ char reset_str[] = "RESET=1";
+ char *envp[] = { reset_str, pid_str, pname_str, flags_str, NULL };
+
+ if (!reset_info) {
+ DRM_WARN("No reset info, not sending the event\n");
+ return;
+ }
+
+ DRM_DEBUG("generating reset event\n");
+
+ snprintf(pid_str, sizeof(pid_str), "PID=%u", reset_info->pid);
+ snprintf(pname_str, sizeof(pname_str), "NAME=%s", reset_info->pname);
+ snprintf(flags_str, sizeof(flags_str), "FLAGS=%u", reset_info->flags);
+ kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
+}
+EXPORT_SYMBOL(drm_sysfs_reset_event);
+
/**
* drm_sysfs_connector_status_event - generate a DRM uevent for connector
* property status change
diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
index 6273cac44e479..8c37d6a529328 100644
--- a/include/drm/drm_sysfs.h
+++ b/include/drm/drm_sysfs.h
@@ -1,17 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _DRM_SYSFS_H_
#define _DRM_SYSFS_H_
+#include <linux/sched.h>
+
+#define DRM_GPU_RESET_FLAG_VRAM_INVALID (1 << 0)

struct drm_device;
struct device;
struct drm_connector;
struct drm_property;

+struct drm_reset_event {
+ uint32_t pid; /* PID of the process involved in the GPU reset */
+ uint32_t flags; /* bitmap of DRM_GPU_RESET_FLAG_* values */
+ char pname[TASK_COMM_LEN]; /* name of the involved process */
+};
+
int drm_class_device_register(struct device *dev);
void drm_class_device_unregister(struct device *dev);

void drm_sysfs_hotplug_event(struct drm_device *dev);
void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
+void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info);
void drm_sysfs_connector_status_event(struct drm_connector *connector,
struct drm_property *property);
#endif
--
2.41.0


From b717d62ae484fe8fc2f87478960e95cad838f574 Mon Sep 17 00:00:00 2001
From: Shashank Sharma <[email protected]>
Date: Mon, 7 Mar 2022 15:33:00 +0100
Subject: [PATCH 2/3] drm/amdgpu: add work function for GPU reset event

This patch adds a work function, which sends a GPU reset
uevent and some contextual information, like the PID and
some status flags. This work should be scheduled during
a GPU reset.

The userspace can do some recovery and post-processing work
based on this event and information.

V2: Addressed review comments from Christian
- Changed the name of the work to gpu_reset_event_work
- Added a structure to accommodate some additional information
(like a PID and some flags)
- Do not add new structure in amdgpu.h

Cc: Alexander Deucher <[email protected]>
Cc: Christian Koenig <[email protected]>
Cc: Amaranath Somalapuram <[email protected]>
Signed-off-by: Shashank Sharma <[email protected]>
(cherry picked from commit f63b09e78126f7da67b69409e2cce1d3ab2d7f46)
[Forward port to 6.0]
Signed-off-by: Cristian Ciocaltea <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++++
2 files changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2eca58220550e..5c376f7f51f4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -60,6 +60,7 @@
#include <drm/amdgpu_drm.h>
#include <drm/drm_gem.h>
#include <drm/drm_ioctl.h>
+#include <drm/drm_sysfs.h>

#include <kgd_kfd_interface.h>
#include "dm_pp_interface.h"
@@ -1002,6 +1003,7 @@ struct amdgpu_device {

int asic_reset_res;
struct work_struct xgmi_reset_work;
+ struct work_struct gpu_reset_event_work;
struct list_head reset_list;

long gfx_timeout;
@@ -1035,6 +1037,7 @@ struct amdgpu_device {
pci_channel_state_t pci_channel_state;

struct amdgpu_reset_control *reset_cntl;
+ struct drm_reset_event reset_event_info;
uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];

bool ram_is_direct_mapped;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f1e9663b40510..08aa72743dfce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -76,6 +76,7 @@
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>
+#include <drm/drm_sysfs.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -3355,6 +3356,17 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
return amdgpu_device_asic_has_dc_support(adev->asic_type);
}

+static void amdgpu_device_reset_event_func(struct work_struct *__work)
+{
+ struct amdgpu_device *adev;
+
+ /* Recover the owning device from its embedded work item. */
+ adev = container_of(__work, struct amdgpu_device, gpu_reset_event_work);
+
+ /* Tell userspace about the reset, handing over the recorded details. */
+ drm_sysfs_reset_event(&adev->ddev, &adev->reset_event_info);
+}
+
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
struct amdgpu_device *adev =
@@ -3606,6 +3618,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
amdgpu_device_delay_enable_gfx_off);

INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+ INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func);

adev->gfx.gfx_off_req_count = 1;
adev->gfx.gfx_off_residency = 0;
--
2.41.0


From b65228799e5ac96211b0e4c342d0a4478a1b6410 Mon Sep 17 00:00:00 2001
From: Somalapuram Amaranath <[email protected]>
Date: Thu, 10 Mar 2022 11:31:44 +0530
Subject: [PATCH 3/3] drm/amdgpu: schedule GPU reset event work function

Schedule work function with valid PID, process name,
and vram lost status during a GPU reset/recovery.

Signed-off-by: Somalapuram Amaranath <[email protected]>
(cherry picked from commit 293c019a84c6402b08db9579819b555b01cd613b)
[Forward ported to 6.0]
Signed-off-by: Cristian Ciocaltea <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 08aa72743dfce..47ff7a56c6848 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4924,6 +4924,20 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
reset_context->job->vm->task_info;
amdgpu_reset_capture_coredumpm(tmp_adev);
#endif
+ if (reset_context->job && reset_context->job->vm) {
+ tmp_adev->reset_event_info.pid =
+ reset_context->job->vm->task_info.pid;
+ /* Bounded copy: always NUL-terminated, remainder zero-filled. */
+ strscpy_pad(tmp_adev->reset_event_info.pname,
+ reset_context->job->vm->task_info.process_name, TASK_COMM_LEN);
+ } else {
+ tmp_adev->reset_event_info.pid = 0;
+ memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN);
+ }
+
+ tmp_adev->reset_event_info.flags = vram_lost ? DRM_GPU_RESET_FLAG_VRAM_INVALID : 0;
+ schedule_work(&tmp_adev->gpu_reset_event_work);
+
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
--
2.41.0

4 changes: 4 additions & 0 deletions linux/PKGBUILD
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ source=(
0001-ayaneo-geek-headset-fix.patch
0002-ayaneo-2-headphone-fix.patch

# GPU reset through sysfs
0001-GPU-reset.patch

#HDR patches
#0001-HDR.patch

Expand Down Expand Up @@ -93,6 +96,7 @@ b2sums=('dfb97f2c9905a150c0890b9c3d464bc13986bec7bcd038c333f4dce443e0a1988a90755
'41162c86f54fcdf7dfbe575e23f8bde4efa8886b5db311945e03bccb353286e70f6e4e83aadc5e81102cfcb8cf1ddab31cbbe016df4237abc07e7a69bc8e5c52'
'974db55c733545223d97e003e2608b3f6c0138cf5c4aeaae1409c64deb4b97b2dd00debcf3487705ad99b8a0bc6aad3df7d117622581710db9a8afc4404f15a1'
'c0aeabea11aa929178ee8ddcf7959fee90ce054a9f029895cdf9e13d440030243285866889da892b0e97980592d30bfbac57190b427bbe29839d56b0062a6d4e'
'16c00f512da12b42b68ab4ea9f66a559755c2a6cfaf07e69b4045b07fdcd16d10896e203c88bdaa726c7c800cc1917f85ab22e59eadde245b1b55b3de5f1b766'
#'065b33cdc3f92a145f9c88e3a31c3a0b8a1e1ba1c02039e9140c70694efda374c8904a34a1d8a47c95e2f950ac51492b8e4958f09f8d342688aa5917b638f76c'
'a2549254930b31e2b1eea76120c74a61787abdb845d46a8532a02e1dde785a236708f3332555a3af839235d54de1146f86f0bf60b5d86d3af2ec044ecc88886a'
'525576f349d77ae5aab136b3effd283e7acbc4eab3553d103d50f12878bc8bbcf5acee74b8ea829ff85f6a08e460066a2aef931df2fd4565f22826a64d1b6313'
Expand Down

0 comments on commit 2ed1cf9

Please sign in to comment.