From 2ed1cf9ac7dfa017ecffad92ded5f4029ddf172b Mon Sep 17 00:00:00 2001 From: bouhaa Date: Mon, 21 Aug 2023 21:15:41 +0200 Subject: [PATCH] Add gpu-reset patches --- linux/0001-GPU-reset.patch | 257 +++++++++++++++++++++++++++++++++++++ linux/PKGBUILD | 4 + 2 files changed, 261 insertions(+) create mode 100644 linux/0001-GPU-reset.patch diff --git a/linux/0001-GPU-reset.patch b/linux/0001-GPU-reset.patch new file mode 100644 index 0000000..32651f4 --- /dev/null +++ b/linux/0001-GPU-reset.patch @@ -0,0 +1,257 @@ +From 69dfc1d574d2faddf770a40cf5d1f9715da33fd2 Mon Sep 17 00:00:00 2001 +From: Shashank Sharma +Date: Mon, 7 Mar 2022 12:32:49 +0100 +Subject: [PATCH 1/3] drm: Add GPU reset sysfs event + +This patch adds a new sysfs event, which will notify +userland about a GPU reset, and can also provide +some information like: +- process ID of the process involved with the GPU reset +- process name of the involved process +- the GPU status info (using flags) + +This patch also introduces the first flag of the flags +bitmap, which can be extended as and when required. 
+ +V2: Addressed review comments from Christian and Amar + - move the reset information structure to DRM layer + - drop _ctx from struct name + - make pid 32 bit(than 64) + - set flag when VRAM invalid (than valid) + - add process name as well (Amar) + +Cc: Alexandar Deucher +Cc: Christian Koenig +Cc: Amaranath Somalapuram +Signed-off-by: Shashank Sharma +(cherry picked from commit 90230bd9d9c7d979038547460c9a2cbbeff8d6b9) +[Forward port to 6.0] +Signed-off-by: Cristian Ciocaltea +--- + drivers/gpu/drm/drm_sysfs.c | 31 +++++++++++++++++++++++++++++++ + include/drm/drm_sysfs.h | 10 ++++++++++ + 2 files changed, 41 insertions(+) + +diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c +index 430e00b16eeca..4cf1b9b40e707 100644 +--- a/drivers/gpu/drm/drm_sysfs.c ++++ b/drivers/gpu/drm/drm_sysfs.c +@@ -434,6 +434,37 @@ void drm_sysfs_connector_hotplug_event(struct drm_connector *connector) + } + EXPORT_SYMBOL(drm_sysfs_connector_hotplug_event); + ++/** ++ * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset ++ * @dev: DRM device ++ * @reset_info: The contextual information about the reset (like PID, flags) ++ * ++ * Send a KOBJ_CHANGE uevent carrying RESET=1, PID=, NAME= and FLAGS= on ++ * the primary kdev of @dev, so that interested userspace clients can take ++ * recovery or profiling measures. Nothing is sent if @reset_info is NULL. 
++ */ ++void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info) ++{ ++ char pid_str[15]; /* "PID=" + up to 10 digits + NUL */ ++ char flags_str[17]; /* "FLAGS=" + up to 10 digits + NUL */ ++ char pname_str[TASK_COMM_LEN + 6]; /* "NAME=" + comm (incl. NUL), 1 spare */ ++ char reset_str[] = "RESET=1"; ++ char *envp[] = { reset_str, pid_str, pname_str, flags_str, NULL }; ++ ++ if (!reset_info) { ++ DRM_WARN("No reset info, not sending the event\n"); ++ return; ++ } ++ ++ DRM_DEBUG("generating reset event\n"); ++ ++ snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%u", reset_info->pid); ++ snprintf(pname_str, ARRAY_SIZE(pname_str), "NAME=%s", reset_info->pname); ++ snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", reset_info->flags); ++ kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); ++} ++EXPORT_SYMBOL(drm_sysfs_reset_event); ++ + /** + * drm_sysfs_connector_status_event - generate a DRM uevent for connector + * property status change +diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h +index 6273cac44e479..8c37d6a529328 100644 +--- a/include/drm/drm_sysfs.h ++++ b/include/drm/drm_sysfs.h +@@ -1,17 +1,27 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + #ifndef _DRM_SYSFS_H_ + #define _DRM_SYSFS_H_ ++#include <linux/sched.h> ++ ++#define DRM_GPU_RESET_FLAG_VRAM_INVALID (1 << 0) + + struct drm_device; + struct device; + struct drm_connector; + struct drm_property; + ++struct drm_reset_event { ++ uint32_t pid; ++ uint32_t flags; ++ char pname[TASK_COMM_LEN]; ++}; ++ + int drm_class_device_register(struct device *dev); + void drm_class_device_unregister(struct device *dev); + + void drm_sysfs_hotplug_event(struct drm_device *dev); + void drm_sysfs_connector_hotplug_event(struct drm_connector *connector); ++void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event *reset_info); + void drm_sysfs_connector_status_event(struct drm_connector *connector, + struct drm_property *property); + #endif +-- +2.41.0 + + +From b717d62ae484fe8fc2f87478960e95cad838f574 Mon Sep 17 00:00:00 2001 +From: Shashank 
Sharma +Date: Mon, 7 Mar 2022 15:33:00 +0100 +Subject: [PATCH 2/3] drm/amdgpu: add work function for GPU reset event + +This patch adds a work function, which sends a GPU reset +uevent and some contextual information, like the PID and +some status flags. This work should be scheduled during +a GPU reset. + +The userspace can do some recovery and post-processing work +based on this event and information. + +V2: Addressed review comments from Christian +- Changed the name of the work to gpu_reset_event_work +- Added a structure to accommodate some additional information + (like a PID and some flags) +- Do not add new structure in amdgpu.h + +Cc: Alexander Deucher +Cc: Christian Koenig +Cc: Amaranath Somalapuram +Signed-off-by: Shashank Sharma +(cherry picked from commit f63b09e78126f7da67b69409e2cce1d3ab2d7f46) +[Forward port to 6.0] +Signed-off-by: Cristian Ciocaltea +--- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++ + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++++ + 2 files changed, 16 insertions(+) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index 2eca58220550e..5c376f7f51f4f 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -60,6 +60,7 @@ + #include + #include + #include ++#include + + #include + #include "dm_pp_interface.h" +@@ -1002,6 +1003,7 @@ struct amdgpu_device { + + int asic_reset_res; + struct work_struct xgmi_reset_work; ++ struct work_struct gpu_reset_event_work; + struct list_head reset_list; + + long gfx_timeout; +@@ -1035,6 +1037,7 @@ struct amdgpu_device { + pci_channel_state_t pci_channel_state; + + struct amdgpu_reset_control *reset_cntl; ++ struct drm_reset_event reset_event_info; + uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; + + bool ram_is_direct_mapped; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index f1e9663b40510..08aa72743dfce 100644 +--- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -76,6 +76,7 @@ + #include + + #include ++#include + + MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); + MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); +@@ -3355,6 +3356,17 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) + return amdgpu_device_asic_has_dc_support(adev->asic_type); + } + ++static void amdgpu_device_reset_event_func(struct work_struct *__work) ++{ ++ struct amdgpu_device *adev; ++ ++ /* Recover the device that embeds this work item. */ ++ adev = container_of(__work, struct amdgpu_device, gpu_reset_event_work); ++ ++ /* Notify userspace of the reset, passing the recorded reset details. */ ++ drm_sysfs_reset_event(&adev->ddev, &adev->reset_event_info); ++} ++ + static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) + { + struct amdgpu_device *adev = +@@ -3606,6 +3618,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, + amdgpu_device_delay_enable_gfx_off); + + INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); ++ INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func); + + adev->gfx.gfx_off_req_count = 1; + adev->gfx.gfx_off_residency = 0; +-- +2.41.0 + + +From b65228799e5ac96211b0e4c342d0a4478a1b6410 Mon Sep 17 00:00:00 2001 +From: Somalapuram Amaranath +Date: Thu, 10 Mar 2022 11:31:44 +0530 +Subject: [PATCH 3/3] drm/amdgpu: schedule GPU reset event work function + +Schedule work function with valid PID, process name, +and vram lost status during a GPU reset/ recovery. 
+ +Signed-off-by: Somalapuram Amaranath +(cherry picked from commit 293c019a84c6402b08db9579819b555b01cd613b) +[Forward ported to 6.0] +Signed-off-by: Cristian Ciocaltea +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 08aa72743dfce..47ff7a56c6848 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -4924,6 +4924,20 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, + reset_context->job->vm->task_info; + amdgpu_reset_capture_coredumpm(tmp_adev); + #endif ++ if (reset_context->job && reset_context->job->vm) { ++ tmp_adev->reset_event_info.pid = ++ reset_context->job->vm->task_info.pid; ++ memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN); ++ strcpy(tmp_adev->reset_event_info.pname, ++ reset_context->job->vm->task_info.process_name); ++ } else { ++ tmp_adev->reset_event_info.pid = 0; ++ memset(tmp_adev->reset_event_info.pname, 0, TASK_COMM_LEN); ++ } ++ ++ tmp_adev->reset_event_info.flags = vram_lost; ++ schedule_work(&tmp_adev->gpu_reset_event_work); ++ + if (vram_lost) { + DRM_INFO("VRAM is lost due to GPU reset!\n"); + amdgpu_inc_vram_lost(tmp_adev); +-- +2.41.0 + diff --git a/linux/PKGBUILD b/linux/PKGBUILD index bd721a2..cca744a 100644 --- a/linux/PKGBUILD +++ b/linux/PKGBUILD @@ -56,6 +56,9 @@ source=( 0001-ayaneo-geek-headset-fix.patch 0002-ayaneo-2-headphone-fix.patch + # GPU reset through sysfs + 0001-GPU-reset.patch + #HDR patches #0001-HDR.patch @@ -93,6 +96,7 @@ b2sums=('dfb97f2c9905a150c0890b9c3d464bc13986bec7bcd038c333f4dce443e0a1988a90755 '41162c86f54fcdf7dfbe575e23f8bde4efa8886b5db311945e03bccb353286e70f6e4e83aadc5e81102cfcb8cf1ddab31cbbe016df4237abc07e7a69bc8e5c52' '974db55c733545223d97e003e2608b3f6c0138cf5c4aeaae1409c64deb4b97b2dd00debcf3487705ad99b8a0bc6aad3df7d117622581710db9a8afc4404f15a1' 
'c0aeabea11aa929178ee8ddcf7959fee90ce054a9f029895cdf9e13d440030243285866889da892b0e97980592d30bfbac57190b427bbe29839d56b0062a6d4e' + '16c00f512da12b42b68ab4ea9f66a559755c2a6cfaf07e69b4045b07fdcd16d10896e203c88bdaa726c7c800cc1917f85ab22e59eadde245b1b55b3de5f1b766' #'065b33cdc3f92a145f9c88e3a31c3a0b8a1e1ba1c02039e9140c70694efda374c8904a34a1d8a47c95e2f950ac51492b8e4958f09f8d342688aa5917b638f76c' 'a2549254930b31e2b1eea76120c74a61787abdb845d46a8532a02e1dde785a236708f3332555a3af839235d54de1146f86f0bf60b5d86d3af2ec044ecc88886a' '525576f349d77ae5aab136b3effd283e7acbc4eab3553d103d50f12878bc8bbcf5acee74b8ea829ff85f6a08e460066a2aef931df2fd4565f22826a64d1b6313'