Line data Source code
1 : /*
2 : * Copyright 2018 Advanced Micro Devices, Inc.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the "Software"),
6 : * to deal in the Software without restriction, including without limitation
7 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 : * and/or sell copies of the Software, and to permit persons to whom the
9 : * Software is furnished to do so, subject to the following conditions:
10 : *
11 : * The above copyright notice and this permission notice shall be included in
12 : * all copies or substantial portions of the Software.
13 : *
14 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 : * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 : * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 : * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 : * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 : * OTHER DEALINGS IN THE SOFTWARE.
21 : *
22 : *
23 : */
24 : #include <linux/list.h>
25 : #include "amdgpu.h"
26 : #include "amdgpu_xgmi.h"
27 : #include "amdgpu_ras.h"
28 : #include "soc15.h"
29 : #include "df/df_3_6_offset.h"
30 : #include "xgmi/xgmi_4_0_0_smn.h"
31 : #include "xgmi/xgmi_4_0_0_sh_mask.h"
32 : #include "wafl/wafl2_4_0_0_smn.h"
33 : #include "wafl/wafl2_4_0_0_sh_mask.h"
34 :
35 : #include "amdgpu_reset.h"
36 :
37 : #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
38 : #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
39 :
40 : static DEFINE_MUTEX(xgmi_mutex);
41 :
42 : #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4
43 :
44 : static LIST_HEAD(xgmi_hive_list);
45 :
46 : static const int xgmi_pcs_err_status_reg_vg20[] = {
47 : smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
48 : smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
49 : };
50 :
51 : static const int wafl_pcs_err_status_reg_vg20[] = {
52 : smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
53 : smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
54 : };
55 :
56 : static const int xgmi_pcs_err_status_reg_arct[] = {
57 : smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
58 : smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
59 : smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
60 : smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
61 : smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
62 : smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
63 : };
64 :
65 : /* same as vg20 */
66 : static const int wafl_pcs_err_status_reg_arct[] = {
67 : smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
68 : smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
69 : };
70 :
71 : static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
72 : smnPCS_XGMI3X16_PCS_ERROR_STATUS,
73 : smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
74 : smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
75 : smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
76 : smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
77 : smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
78 : smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
79 : smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
80 : };
81 :
82 : static const int walf_pcs_err_status_reg_aldebaran[] = {
83 : smnPCS_GOPX1_PCS_ERROR_STATUS,
84 : smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
85 : };
86 :
87 : static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
88 : {"XGMI PCS DataLossErr",
89 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
90 : {"XGMI PCS TrainingErr",
91 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
92 : {"XGMI PCS CRCErr",
93 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
94 : {"XGMI PCS BERExceededErr",
95 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
96 : {"XGMI PCS TxMetaDataErr",
97 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
98 : {"XGMI PCS ReplayBufParityErr",
99 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
100 : {"XGMI PCS DataParityErr",
101 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
102 : {"XGMI PCS ReplayFifoOverflowErr",
103 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
104 : {"XGMI PCS ReplayFifoUnderflowErr",
105 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
106 : {"XGMI PCS ElasticFifoOverflowErr",
107 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
108 : {"XGMI PCS DeskewErr",
109 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
110 : {"XGMI PCS DataStartupLimitErr",
111 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
112 : {"XGMI PCS FCInitTimeoutErr",
113 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
114 : {"XGMI PCS RecoveryTimeoutErr",
115 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
116 : {"XGMI PCS ReadySerialTimeoutErr",
117 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
118 : {"XGMI PCS ReadySerialAttemptErr",
119 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
120 : {"XGMI PCS RecoveryAttemptErr",
121 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
122 : {"XGMI PCS RecoveryRelockAttemptErr",
123 : SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
124 : };
125 :
126 : static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
127 : {"WAFL PCS DataLossErr",
128 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
129 : {"WAFL PCS TrainingErr",
130 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
131 : {"WAFL PCS CRCErr",
132 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
133 : {"WAFL PCS BERExceededErr",
134 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
135 : {"WAFL PCS TxMetaDataErr",
136 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
137 : {"WAFL PCS ReplayBufParityErr",
138 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
139 : {"WAFL PCS DataParityErr",
140 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
141 : {"WAFL PCS ReplayFifoOverflowErr",
142 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
143 : {"WAFL PCS ReplayFifoUnderflowErr",
144 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
145 : {"WAFL PCS ElasticFifoOverflowErr",
146 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
147 : {"WAFL PCS DeskewErr",
148 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
149 : {"WAFL PCS DataStartupLimitErr",
150 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
151 : {"WAFL PCS FCInitTimeoutErr",
152 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
153 : {"WAFL PCS RecoveryTimeoutErr",
154 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
155 : {"WAFL PCS ReadySerialTimeoutErr",
156 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
157 : {"WAFL PCS ReadySerialAttemptErr",
158 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
159 : {"WAFL PCS RecoveryAttemptErr",
160 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
161 : {"WAFL PCS RecoveryRelockAttemptErr",
162 : SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
163 : };
164 :
165 : /**
166 : * DOC: AMDGPU XGMI Support
167 : *
168 : * XGMI is a high speed interconnect that joins multiple GPU cards
169 : * into a homogeneous memory space that is organized by a collective
170 : * hive ID and individual node IDs, both of which are 64-bit numbers.
171 : *
172 : * The file xgmi_device_id contains the unique per GPU device ID and
173 : * is stored in the /sys/class/drm/card${cardno}/device/ directory.
174 : *
175 : * Inside the device directory a sub-directory 'xgmi_hive_info' is
176 : * created which contains the hive ID and the list of nodes.
177 : *
178 : * The hive ID is stored in:
179 : * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
180 : *
181 : * The node information is stored in numbered directories:
182 : * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
183 : *
184 : * Each device has its own xgmi_hive_info directory with a mirrored
185 : * set of node sub-directories.
186 : *
187 : * The XGMI memory space is built by contiguously concatenating the
188 : * power-of-two padded VRAM space of each node.
189 : *
190 : */
191 :
192 : static struct attribute amdgpu_xgmi_hive_id = {
193 : .name = "xgmi_hive_id",
194 : .mode = S_IRUGO
195 : };
196 :
197 : static struct attribute *amdgpu_xgmi_hive_attrs[] = {
198 : &amdgpu_xgmi_hive_id,
199 : NULL
200 : };
201 : ATTRIBUTE_GROUPS(amdgpu_xgmi_hive);
202 :
203 0 : static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
204 : struct attribute *attr, char *buf)
205 : {
206 0 : struct amdgpu_hive_info *hive = container_of(
207 : kobj, struct amdgpu_hive_info, kobj);
208 :
209 0 : if (attr == &amdgpu_xgmi_hive_id)
210 0 : return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
211 :
212 : return 0;
213 : }
214 :
215 0 : static void amdgpu_xgmi_hive_release(struct kobject *kobj)
216 : {
217 0 : struct amdgpu_hive_info *hive = container_of(
218 : kobj, struct amdgpu_hive_info, kobj);
219 :
220 0 : amdgpu_reset_put_reset_domain(hive->reset_domain);
221 0 : hive->reset_domain = NULL;
222 :
223 0 : mutex_destroy(&hive->hive_lock);
224 0 : kfree(hive);
225 0 : }
226 :
227 : static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
228 : .show = amdgpu_xgmi_show_attrs,
229 : };
230 :
231 : struct kobj_type amdgpu_xgmi_hive_type = {
232 : .release = amdgpu_xgmi_hive_release,
233 : .sysfs_ops = &amdgpu_xgmi_hive_ops,
234 : .default_groups = amdgpu_xgmi_hive_groups,
235 : };
236 :
237 0 : static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
238 : struct device_attribute *attr,
239 : char *buf)
240 : {
241 0 : struct drm_device *ddev = dev_get_drvdata(dev);
242 0 : struct amdgpu_device *adev = drm_to_adev(ddev);
243 :
244 0 : return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
245 :
246 : }
247 :
248 : #define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
249 0 : static ssize_t amdgpu_xgmi_show_error(struct device *dev,
250 : struct device_attribute *attr,
251 : char *buf)
252 : {
253 0 : struct drm_device *ddev = dev_get_drvdata(dev);
254 0 : struct amdgpu_device *adev = drm_to_adev(ddev);
255 : uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
256 : uint64_t fica_out;
257 0 : unsigned int error_count = 0;
258 :
259 0 : ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
260 0 : ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
261 :
262 0 : if ((!adev->df.funcs) ||
263 0 : (!adev->df.funcs->get_fica) ||
264 0 : (!adev->df.funcs->set_fica))
265 : return -EINVAL;
266 :
267 0 : fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
268 0 : if (fica_out != 0x1f)
269 0 : pr_err("xGMI error counters not enabled!\n");
270 :
271 0 : fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);
272 :
273 0 : if ((fica_out & 0xffff) == 2)
274 0 : error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
275 :
276 0 : adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
277 :
278 0 : return sysfs_emit(buf, "%u\n", error_count);
279 : }
280 :
281 :
282 : static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
283 : static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
284 :
285 0 : static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
286 : struct amdgpu_hive_info *hive)
287 : {
288 0 : int ret = 0;
289 0 : char node[10] = { 0 };
290 :
291 : /* Create xgmi device id file */
292 0 : ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
293 0 : if (ret) {
294 0 : dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
295 0 : return ret;
296 : }
297 :
298 : /* Create xgmi error file */
299 0 : ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
300 0 : if (ret)
301 0 : pr_err("failed to create xgmi_error\n");
302 :
303 :
304 : /* Create a sysfs link to the hive info folder, which lives on the first device */
305 0 : if (hive->kobj.parent != (&adev->dev->kobj)) {
306 0 : ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
307 : "xgmi_hive_info");
308 0 : if (ret) {
309 0 : dev_err(adev->dev, "XGMI: Failed to create link to hive info");
310 0 : goto remove_file;
311 : }
312 : }
313 :
314 0 : sprintf(node, "node%d", atomic_read(&hive->number_devices));
315 : /* Create a sysfs link from the hive folder back to this device */
316 0 : ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
317 0 : if (ret) {
318 0 : dev_err(adev->dev, "XGMI: Failed to create link from hive info");
319 : goto remove_link;
320 : }
321 :
322 : goto success;
323 :
324 :
325 : remove_link:
326 0 : sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);
327 :
328 : remove_file:
329 0 : device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
330 :
331 : success:
332 : return ret;
333 : }
334 :
335 0 : static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
336 : struct amdgpu_hive_info *hive)
337 : {
338 : char node[10];
339 0 : memset(node, 0, sizeof(node));
340 :
341 0 : device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
342 0 : device_remove_file(adev->dev, &dev_attr_xgmi_error);
343 :
344 0 : if (hive->kobj.parent != (&adev->dev->kobj))
345 0 : sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");
346 :
347 0 : sprintf(node, "node%d", atomic_read(&hive->number_devices));
348 0 : sysfs_remove_link(&hive->kobj, node);
349 :
350 0 : }
351 :
352 :
353 :
354 0 : struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
355 : {
356 0 : struct amdgpu_hive_info *hive = NULL;
357 : int ret;
358 :
359 0 : if (!adev->gmc.xgmi.hive_id)
360 : return NULL;
361 :
362 0 : if (adev->hive) {
363 0 : kobject_get(&adev->hive->kobj);
364 0 : return adev->hive;
365 : }
366 :
367 0 : mutex_lock(&xgmi_mutex);
368 :
369 0 : list_for_each_entry(hive, &xgmi_hive_list, node) {
370 0 : if (hive->hive_id == adev->gmc.xgmi.hive_id)
371 : goto pro_end;
372 : }
373 :
374 0 : hive = kzalloc(sizeof(*hive), GFP_KERNEL);
375 0 : if (!hive) {
376 0 : dev_err(adev->dev, "XGMI: allocation failed\n");
377 0 : hive = NULL;
378 0 : goto pro_end;
379 : }
380 :
381 : /* initialize the new hive if it does not exist yet */
382 0 : ret = kobject_init_and_add(&hive->kobj,
383 : &amdgpu_xgmi_hive_type,
384 0 : &adev->dev->kobj,
385 : "%s", "xgmi_hive_info");
386 0 : if (ret) {
387 0 : dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
388 0 : kobject_put(&hive->kobj);
389 0 : kfree(hive);
390 0 : hive = NULL;
391 0 : goto pro_end;
392 : }
393 :
394 : /**
395 : * Only init hive->reset_domain for non-SRIOV configurations. For SRIOV, the
396 : * host driver decides how to reset the GPU, either through FLR or chain reset.
397 : * The guest side will get individual notifications from the host for the FLR
398 : * if necessary.
399 : */
400 0 : if (!amdgpu_sriov_vf(adev)) {
401 : /**
402 : * Avoid recreating the reset domain when the hive is reconstructed for the case
403 : * of resetting the devices in the XGMI hive during probe for a passthrough GPU.
404 : * See https://www.spinics.net/lists/amd-gfx/msg58836.html
405 : */
406 0 : if (adev->reset_domain->type != XGMI_HIVE) {
407 0 : hive->reset_domain =
408 0 : amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
409 0 : if (!hive->reset_domain) {
410 0 : dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
411 0 : ret = -ENOMEM;
412 0 : kobject_put(&hive->kobj);
413 0 : kfree(hive);
414 0 : hive = NULL;
415 0 : goto pro_end;
416 : }
417 : } else {
418 0 : amdgpu_reset_get_reset_domain(adev->reset_domain);
419 0 : hive->reset_domain = adev->reset_domain;
420 : }
421 : }
422 :
423 0 : hive->hive_id = adev->gmc.xgmi.hive_id;
424 0 : INIT_LIST_HEAD(&hive->device_list);
425 0 : INIT_LIST_HEAD(&hive->node);
426 0 : mutex_init(&hive->hive_lock);
427 0 : atomic_set(&hive->number_devices, 0);
428 0 : task_barrier_init(&hive->tb);
429 0 : hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
430 0 : hive->hi_req_gpu = NULL;
431 :
432 : /*
433 : * The hive pstate on boot is high on vega20, so we have to go to the
434 : * low pstate after boot.
435 : */
436 0 : hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
437 0 : list_add_tail(&hive->node, &xgmi_hive_list);
438 :
439 : pro_end:
440 0 : if (hive)
441 0 : kobject_get(&hive->kobj);
442 0 : mutex_unlock(&xgmi_mutex);
443 0 : return hive;
444 : }
445 :
446 0 : void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
447 : {
448 0 : if (hive)
449 0 : kobject_put(&hive->kobj);
450 0 : }
451 :
452 0 : int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
453 : {
454 0 : int ret = 0;
455 : struct amdgpu_hive_info *hive;
456 : struct amdgpu_device *request_adev;
457 0 : bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
458 : bool init_low;
459 :
460 0 : hive = amdgpu_get_xgmi_hive(adev);
461 0 : if (!hive)
462 : return 0;
463 :
464 0 : request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
465 0 : init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
466 0 : amdgpu_put_xgmi_hive(hive);
467 : /* fw bug so temporarily disable pstate switching */
468 0 : return 0;
469 :
470 : if (!hive || adev->asic_type != CHIP_VEGA20)
471 : return 0;
472 :
473 : mutex_lock(&hive->hive_lock);
474 :
475 : if (is_hi_req)
476 : hive->hi_req_count++;
477 : else
478 : hive->hi_req_count--;
479 :
480 : /*
481 : * Vega20 only needs a single peer to request pstate high for the hive to
482 : * go high, but all peers must request pstate low for the hive to go low.
483 : */
484 : if (hive->pstate == pstate ||
485 : (!is_hi_req && hive->hi_req_count && !init_low))
486 : goto out;
487 :
488 : dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);
489 :
490 : ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
491 : if (ret) {
492 : dev_err(request_adev->dev,
493 : "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
494 : request_adev->gmc.xgmi.node_id,
495 : request_adev->gmc.xgmi.hive_id, ret);
496 : goto out;
497 : }
498 :
499 : if (init_low)
500 : hive->pstate = hive->hi_req_count ?
501 : hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
502 : else {
503 : hive->pstate = pstate;
504 : hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
505 : adev : NULL;
506 : }
507 : out:
508 : mutex_unlock(&hive->hive_lock);
509 : return ret;
510 : }
511 :
512 0 : int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
513 : {
514 : int ret;
515 :
516 0 : if (amdgpu_sriov_vf(adev))
517 : return 0;
518 :
519 : /* Each psp needs to set the latest topology */
520 0 : ret = psp_xgmi_set_topology_info(&adev->psp,
521 0 : atomic_read(&hive->number_devices),
522 : &adev->psp.xgmi_context.top_info);
523 0 : if (ret)
524 0 : dev_err(adev->dev,
525 : "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
526 : adev->gmc.xgmi.node_id,
527 : adev->gmc.xgmi.hive_id, ret);
528 :
529 : return ret;
530 : }
531 :
532 :
533 : /*
534 : * NOTE psp_xgmi_node_info.num_hops layout is as follows:
535 : * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
536 : * num_hops[5:3] = reserved
537 : * num_hops[2:0] = number of hops
538 : */
539 0 : int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
540 : struct amdgpu_device *peer_adev)
541 : {
542 0 : struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
543 0 : uint8_t num_hops_mask = 0x7;
544 : int i;
545 :
546 0 : for (i = 0 ; i < top->num_nodes; ++i)
547 0 : if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
548 0 : return top->nodes[i].num_hops & num_hops_mask;
549 : return -EINVAL;
550 : }
551 :
552 0 : int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
553 : struct amdgpu_device *peer_adev)
554 : {
555 0 : struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
556 : int i;
557 :
558 0 : for (i = 0 ; i < top->num_nodes; ++i)
559 0 : if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
560 0 : return top->nodes[i].num_links;
561 : return -EINVAL;
562 : }
563 :
564 : /*
565 : * Devices that support extended data require the entire hive to initialize with
566 : * the shared memory buffer flag set.
567 : *
568 : * Hive locks and conditions apply - see amdgpu_xgmi_add_device
569 : */
570 0 : static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
571 : bool set_extended_data)
572 : {
573 : struct amdgpu_device *tmp_adev;
574 : int ret;
575 :
576 0 : list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
577 0 : ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
578 0 : if (ret) {
579 0 : dev_err(tmp_adev->dev,
580 : "XGMI: Failed to initialize xgmi session for data partition %i\n",
581 : set_extended_data);
582 0 : return ret;
583 : }
584 :
585 : }
586 :
587 : return 0;
588 : }
589 :
590 0 : int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
591 : {
592 : struct psp_xgmi_topology_info *top_info;
593 : struct amdgpu_hive_info *hive;
594 : struct amdgpu_xgmi *entry;
595 0 : struct amdgpu_device *tmp_adev = NULL;
596 :
597 0 : int count = 0, ret = 0;
598 :
599 0 : if (!adev->gmc.xgmi.supported)
600 : return 0;
601 :
602 0 : if (!adev->gmc.xgmi.pending_reset &&
603 0 : amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
604 0 : ret = psp_xgmi_initialize(&adev->psp, false, true);
605 0 : if (ret) {
606 0 : dev_err(adev->dev,
607 : "XGMI: Failed to initialize xgmi session\n");
608 0 : return ret;
609 : }
610 :
611 0 : ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
612 0 : if (ret) {
613 0 : dev_err(adev->dev,
614 : "XGMI: Failed to get hive id\n");
615 0 : return ret;
616 : }
617 :
618 0 : ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
619 0 : if (ret) {
620 0 : dev_err(adev->dev,
621 : "XGMI: Failed to get node id\n");
622 0 : return ret;
623 : }
624 : } else {
625 0 : adev->gmc.xgmi.hive_id = 16;
626 0 : adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
627 : }
628 :
629 0 : hive = amdgpu_get_xgmi_hive(adev);
630 0 : if (!hive) {
631 0 : ret = -EINVAL;
632 0 : dev_err(adev->dev,
633 : "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
634 : adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
635 0 : goto exit;
636 : }
637 0 : mutex_lock(&hive->hive_lock);
638 :
639 0 : top_info = &adev->psp.xgmi_context.top_info;
640 :
641 0 : list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
642 0 : list_for_each_entry(entry, &hive->device_list, head)
643 0 : top_info->nodes[count++].node_id = entry->node_id;
644 0 : top_info->num_nodes = count;
645 0 : atomic_set(&hive->number_devices, count);
646 :
647 0 : task_barrier_add_task(&hive->tb);
648 :
649 0 : if (!adev->gmc.xgmi.pending_reset &&
650 0 : amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
651 0 : list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
652 : /* update the node list for the other devices in the hive */
653 0 : if (tmp_adev != adev) {
654 0 : top_info = &tmp_adev->psp.xgmi_context.top_info;
655 0 : top_info->nodes[count - 1].node_id =
656 0 : adev->gmc.xgmi.node_id;
657 0 : top_info->num_nodes = count;
658 : }
659 0 : ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
660 0 : if (ret)
661 : goto exit_unlock;
662 : }
663 :
664 : /* get latest topology info for each device from psp */
665 0 : list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
666 0 : ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
667 : &tmp_adev->psp.xgmi_context.top_info, false);
668 0 : if (ret) {
669 0 : dev_err(tmp_adev->dev,
670 : "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
671 : tmp_adev->gmc.xgmi.node_id,
672 : tmp_adev->gmc.xgmi.hive_id, ret);
673 : /* TODO: continue when some node failed, or disable the whole hive */
674 0 : goto exit_unlock;
675 : }
676 : }
677 :
678 : /* get topology again for hives that support extended data */
679 0 : if (adev->psp.xgmi_context.supports_extended_data) {
680 :
681 : /* initialize the hive to get extended data. */
682 0 : ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
683 0 : if (ret)
684 : goto exit_unlock;
685 :
686 : /* get the extended data. */
687 0 : list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
688 0 : ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
689 : &tmp_adev->psp.xgmi_context.top_info, true);
690 0 : if (ret) {
691 0 : dev_err(tmp_adev->dev,
692 : "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
693 : tmp_adev->gmc.xgmi.node_id,
694 : tmp_adev->gmc.xgmi.hive_id, ret);
695 0 : goto exit_unlock;
696 : }
697 : }
698 :
699 : /* initialize the hive to get non-extended data for the next round. */
700 0 : ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
701 0 : if (ret)
702 : goto exit_unlock;
703 :
704 : }
705 : }
706 :
707 0 : if (!ret && !adev->gmc.xgmi.pending_reset)
708 0 : ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
709 :
710 : exit_unlock:
711 0 : mutex_unlock(&hive->hive_lock);
712 : exit:
713 0 : if (!ret) {
714 0 : adev->hive = hive;
715 0 : dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
716 : adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
717 : } else {
718 0 : amdgpu_put_xgmi_hive(hive);
719 0 : dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
720 : adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
721 : ret);
722 : }
723 :
724 : return ret;
725 : }
726 :
727 0 : int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
728 : {
729 0 : struct amdgpu_hive_info *hive = adev->hive;
730 :
731 0 : if (!adev->gmc.xgmi.supported)
732 : return -EINVAL;
733 :
734 0 : if (!hive)
735 : return -EINVAL;
736 :
737 0 : mutex_lock(&hive->hive_lock);
738 0 : task_barrier_rem_task(&hive->tb);
739 0 : amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
740 0 : if (hive->hi_req_gpu == adev)
741 0 : hive->hi_req_gpu = NULL;
742 0 : list_del(&adev->gmc.xgmi.head);
743 0 : mutex_unlock(&hive->hive_lock);
744 :
745 0 : amdgpu_put_xgmi_hive(hive);
746 0 : adev->hive = NULL;
747 :
748 0 : if (atomic_dec_return(&hive->number_devices) == 0) {
749 : /* Remove the hive from the global hive list */
750 0 : mutex_lock(&xgmi_mutex);
751 0 : list_del(&hive->node);
752 0 : mutex_unlock(&xgmi_mutex);
753 :
754 : amdgpu_put_xgmi_hive(hive);
755 : }
756 :
757 : return 0;
758 : }
759 :
760 0 : static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
761 : {
762 0 : if (!adev->gmc.xgmi.supported ||
763 0 : adev->gmc.xgmi.num_physical_nodes == 0)
764 : return 0;
765 :
766 0 : adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
767 :
768 0 : return amdgpu_ras_block_late_init(adev, ras_block);
769 : }
770 :
771 0 : uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
772 : uint64_t addr)
773 : {
774 0 : struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
775 0 : return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
776 : }
777 :
778 : static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
779 : {
780 0 : WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
781 0 : WREG32_PCIE(pcs_status_reg, 0);
782 : }
783 :
784 0 : static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
785 : {
786 : uint32_t i;
787 :
788 0 : switch (adev->asic_type) {
789 : case CHIP_ARCTURUS:
790 0 : for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
791 0 : pcs_clear_status(adev,
792 0 : xgmi_pcs_err_status_reg_arct[i]);
793 : break;
794 : case CHIP_VEGA20:
795 0 : for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
796 0 : pcs_clear_status(adev,
797 0 : xgmi_pcs_err_status_reg_vg20[i]);
798 : break;
799 : case CHIP_ALDEBARAN:
800 0 : for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
801 0 : pcs_clear_status(adev,
802 0 : xgmi3x16_pcs_err_status_reg_aldebaran[i]);
803 0 : for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
804 0 : pcs_clear_status(adev,
805 0 : walf_pcs_err_status_reg_aldebaran[i]);
806 : break;
807 : default:
808 : break;
809 : }
810 0 : }
811 :
812 0 : static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
813 : uint32_t value,
814 : uint32_t *ue_count,
815 : uint32_t *ce_count,
816 : bool is_xgmi_pcs)
817 : {
818 : int i;
819 : int ue_cnt;
820 :
821 0 : if (is_xgmi_pcs) {
822 : /* query xgmi pcs error status,
823 : * only ue is supported */
824 0 : for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) {
825 0 : ue_cnt = (value &
826 0 : xgmi_pcs_ras_fields[i].pcs_err_mask) >>
827 0 : xgmi_pcs_ras_fields[i].pcs_err_shift;
828 0 : if (ue_cnt) {
829 0 : dev_info(adev->dev, "%s detected\n",
830 : xgmi_pcs_ras_fields[i].err_name);
831 0 : *ue_count += ue_cnt;
832 : }
833 : }
834 : } else {
835 : /* query wafl pcs error status,
836 : * only ue is supported */
837 0 : for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
838 0 : ue_cnt = (value &
839 0 : wafl_pcs_ras_fields[i].pcs_err_mask) >>
840 0 : wafl_pcs_ras_fields[i].pcs_err_shift;
841 0 : if (ue_cnt) {
842 0 : dev_info(adev->dev, "%s detected\n",
843 : wafl_pcs_ras_fields[i].err_name);
844 0 : *ue_count += ue_cnt;
845 : }
846 : }
847 : }
848 :
849 0 : return 0;
850 : }
851 :
852 0 : static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
853 : void *ras_error_status)
854 : {
855 0 : struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
856 : int i;
857 : uint32_t data;
858 0 : uint32_t ue_cnt = 0, ce_cnt = 0;
859 :
860 0 : if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
861 0 : return;
862 :
863 0 : err_data->ue_count = 0;
864 0 : err_data->ce_count = 0;
865 :
866 0 : switch (adev->asic_type) {
867 : case CHIP_ARCTURUS:
868 : /* check xgmi pcs error */
869 0 : for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
870 0 : data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
871 0 : if (data)
872 0 : amdgpu_xgmi_query_pcs_error_status(adev,
873 : data, &ue_cnt, &ce_cnt, true);
874 : }
875 : /* check wafl pcs error */
876 0 : for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
877 0 : data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
878 0 : if (data)
879 0 : amdgpu_xgmi_query_pcs_error_status(adev,
880 : data, &ue_cnt, &ce_cnt, false);
881 : }
882 : break;
883 : case CHIP_VEGA20:
884 : /* check xgmi pcs error */
885 0 : for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
886 0 : data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
887 0 : if (data)
888 0 : amdgpu_xgmi_query_pcs_error_status(adev,
889 : data, &ue_cnt, &ce_cnt, true);
890 : }
891 : /* check wafl pcs error */
892 0 : for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
893 0 : data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
894 0 : if (data)
895 0 : amdgpu_xgmi_query_pcs_error_status(adev,
896 : data, &ue_cnt, &ce_cnt, false);
897 : }
898 : break;
899 : case CHIP_ALDEBARAN:
900 : /* check xgmi3x16 pcs error */
901 0 : for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
902 0 : data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
903 0 : if (data)
904 0 : amdgpu_xgmi_query_pcs_error_status(adev,
905 : data, &ue_cnt, &ce_cnt, true);
906 : }
907 : /* check wafl pcs error */
908 0 : for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
909 0 : data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
910 0 : if (data)
911 0 : amdgpu_xgmi_query_pcs_error_status(adev,
912 : data, &ue_cnt, &ce_cnt, false);
913 : }
914 : break;
915 : default:
916 0 : dev_warn(adev->dev, "XGMI RAS error query not supported");
917 0 : break;
918 : }
919 :
920 0 : adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
921 :
922 0 : err_data->ue_count += ue_cnt;
923 0 : err_data->ce_count += ce_cnt;
924 : }
925 :
926 : /* Trigger XGMI/WAFL error */
927 0 : static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if)
928 : {
929 0 : int ret = 0;
930 0 : struct ta_ras_trigger_error_input *block_info =
931 : (struct ta_ras_trigger_error_input *)inject_if;
932 :
933 0 : if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
934 0 : dev_warn(adev->dev, "Failed to disallow df cstate");
935 :
936 0 : if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
937 0 : dev_warn(adev->dev, "Failed to disallow XGMI power down");
938 :
939 0 : ret = psp_ras_trigger_error(&adev->psp, block_info);
940 :
941 0 : if (amdgpu_ras_intr_triggered())
942 : return ret;
943 :
944 0 : if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
945 0 : dev_warn(adev->dev, "Failed to allow XGMI power down");
946 :
947 0 : if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
948 0 : dev_warn(adev->dev, "Failed to allow df cstate");
949 :
950 : return ret;
951 : }
952 :
953 : struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = {
954 : .query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
955 : .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
956 : .ras_error_inject = amdgpu_ras_error_inject_xgmi,
957 : };
958 :
959 : struct amdgpu_xgmi_ras xgmi_ras = {
960 : .ras_block = {
961 : .ras_comm = {
962 : .name = "xgmi_wafl",
963 : .block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
964 : .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
965 : },
966 : .hw_ops = &xgmi_ras_hw_ops,
967 : .ras_late_init = amdgpu_xgmi_ras_late_init,
968 : },
969 : };