Line data Source code
1 : /*
2 : * Copyright 2008 Advanced Micro Devices, Inc.
3 : * Copyright 2008 Red Hat Inc.
4 : * Copyright 2009 Jerome Glisse.
5 : *
6 : * Permission is hereby granted, free of charge, to any person obtaining a
7 : * copy of this software and associated documentation files (the "Software"),
8 : * to deal in the Software without restriction, including without limitation
9 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 : * and/or sell copies of the Software, and to permit persons to whom the
11 : * Software is furnished to do so, subject to the following conditions:
12 : *
13 : * The above copyright notice and this permission notice shall be included in
14 : * all copies or substantial portions of the Software.
15 : *
16 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 : * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 : * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 : * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 : * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 : * OTHER DEALINGS IN THE SOFTWARE.
23 : *
24 : * Authors: Dave Airlie
25 : * Alex Deucher
26 : * Jerome Glisse
27 : */
28 : #include <linux/power_supply.h>
29 : #include <linux/kthread.h>
30 : #include <linux/module.h>
31 : #include <linux/console.h>
32 : #include <linux/slab.h>
33 : #include <linux/iommu.h>
34 : #include <linux/pci.h>
35 : #include <linux/devcoredump.h>
36 : #include <generated/utsrelease.h>
37 : #include <linux/pci-p2pdma.h>
38 :
39 : #include <drm/drm_atomic_helper.h>
40 : #include <drm/drm_probe_helper.h>
41 : #include <drm/amdgpu_drm.h>
42 : #include <linux/vgaarb.h>
43 : #include <linux/vga_switcheroo.h>
44 : #include <linux/efi.h>
45 : #include "amdgpu.h"
46 : #include "amdgpu_trace.h"
47 : #include "amdgpu_i2c.h"
48 : #include "atom.h"
49 : #include "amdgpu_atombios.h"
50 : #include "amdgpu_atomfirmware.h"
51 : #include "amd_pcie.h"
52 : #ifdef CONFIG_DRM_AMDGPU_SI
53 : #include "si.h"
54 : #endif
55 : #ifdef CONFIG_DRM_AMDGPU_CIK
56 : #include "cik.h"
57 : #endif
58 : #include "vi.h"
59 : #include "soc15.h"
60 : #include "nv.h"
61 : #include "bif/bif_4_1_d.h"
62 : #include <linux/firmware.h>
63 : #include "amdgpu_vf_error.h"
64 :
65 : #include "amdgpu_amdkfd.h"
66 : #include "amdgpu_pm.h"
67 :
68 : #include "amdgpu_xgmi.h"
69 : #include "amdgpu_ras.h"
70 : #include "amdgpu_pmu.h"
71 : #include "amdgpu_fru_eeprom.h"
72 : #include "amdgpu_reset.h"
73 :
74 : #include <linux/suspend.h>
75 : #include <drm/task_barrier.h>
76 : #include <linux/pm_runtime.h>
77 :
78 : #include <drm/drm_drv.h>
79 :
80 : MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
81 : MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
82 : MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
83 : MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
84 : MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
85 : MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
86 : MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
87 :
88 : #define AMDGPU_RESUME_MS 2000
89 : #define AMDGPU_MAX_RETRY_LIMIT 2
90 : #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
91 :
92 : const char *amdgpu_asic_name[] = {
93 : "TAHITI",
94 : "PITCAIRN",
95 : "VERDE",
96 : "OLAND",
97 : "HAINAN",
98 : "BONAIRE",
99 : "KAVERI",
100 : "KABINI",
101 : "HAWAII",
102 : "MULLINS",
103 : "TOPAZ",
104 : "TONGA",
105 : "FIJI",
106 : "CARRIZO",
107 : "STONEY",
108 : "POLARIS10",
109 : "POLARIS11",
110 : "POLARIS12",
111 : "VEGAM",
112 : "VEGA10",
113 : "VEGA12",
114 : "VEGA20",
115 : "RAVEN",
116 : "ARCTURUS",
117 : "RENOIR",
118 : "ALDEBARAN",
119 : "NAVI10",
120 : "CYAN_SKILLFISH",
121 : "NAVI14",
122 : "NAVI12",
123 : "SIENNA_CICHLID",
124 : "NAVY_FLOUNDER",
125 : "VANGOGH",
126 : "DIMGREY_CAVEFISH",
127 : "BEIGE_GOBY",
128 : "YELLOW_CARP",
129 : "IP DISCOVERY",
130 : "LAST",
131 : };
132 :
133 : /**
134 : * DOC: pcie_replay_count
135 : *
136 : * The amdgpu driver provides a sysfs API for reporting the total number
137 : * of PCIe replays (NAKs).
138 : * The file pcie_replay_count is used for this and returns the total
139 : * number of replays as the sum of the NAKs generated and the NAKs received.
140 : */
141 :
142 0 : static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
143 : struct device_attribute *attr, char *buf)
144 : {
145 0 : struct drm_device *ddev = dev_get_drvdata(dev);
146 0 : struct amdgpu_device *adev = drm_to_adev(ddev);
147 0 : uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
148 :
149 0 : return sysfs_emit(buf, "%llu\n", cnt);
150 : }
151 :
152 : static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
153 : amdgpu_device_get_pcie_replay_count, NULL);
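/*
 * Illustrative usage only (not part of the driver): once the attribute above
 * is registered on the device, it is expected to be readable from user space
 * under the PCI device's sysfs directory, e.g.
 *
 *   cat /sys/bus/pci/devices/<domain:bus:dev.fn>/pcie_replay_count
 *
 * The exact path is an assumption and depends on how the system enumerates
 * the device.
 */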
154 :
155 : static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
156 :
157 : /**
158 : * DOC: product_name
159 : *
160 : * The amdgpu driver provides a sysfs API for reporting the product name
161 : * for the device.
162 : * The file product_name is used for this and returns the product name
163 : * as returned from the FRU.
164 : * NOTE: This is only available for certain server cards
165 : */
166 :
167 0 : static ssize_t amdgpu_device_get_product_name(struct device *dev,
168 : struct device_attribute *attr, char *buf)
169 : {
170 0 : struct drm_device *ddev = dev_get_drvdata(dev);
171 0 : struct amdgpu_device *adev = drm_to_adev(ddev);
172 :
173 0 : return sysfs_emit(buf, "%s\n", adev->product_name);
174 : }
175 :
176 : static DEVICE_ATTR(product_name, S_IRUGO,
177 : amdgpu_device_get_product_name, NULL);
178 :
179 : /**
180 : * DOC: product_number
181 : *
182 : * The amdgpu driver provides a sysfs API for reporting the part number
183 : * for the device.
184 : * The file product_number is used for this and returns the part number
185 : * as returned from the FRU.
186 : * NOTE: This is only available for certain server cards
187 : */
188 :
189 0 : static ssize_t amdgpu_device_get_product_number(struct device *dev,
190 : struct device_attribute *attr, char *buf)
191 : {
192 0 : struct drm_device *ddev = dev_get_drvdata(dev);
193 0 : struct amdgpu_device *adev = drm_to_adev(ddev);
194 :
195 0 : return sysfs_emit(buf, "%s\n", adev->product_number);
196 : }
197 :
198 : static DEVICE_ATTR(product_number, S_IRUGO,
199 : amdgpu_device_get_product_number, NULL);
200 :
201 : /**
202 : * DOC: serial_number
203 : *
204 : * The amdgpu driver provides a sysfs API for reporting the serial number
205 : * for the device.
206 : * The file serial_number is used for this and returns the serial number
207 : * as returned from the FRU.
208 : * NOTE: This is only available for certain server cards
209 : */
210 :
211 0 : static ssize_t amdgpu_device_get_serial_number(struct device *dev,
212 : struct device_attribute *attr, char *buf)
213 : {
214 0 : struct drm_device *ddev = dev_get_drvdata(dev);
215 0 : struct amdgpu_device *adev = drm_to_adev(ddev);
216 :
217 0 : return sysfs_emit(buf, "%s\n", adev->serial);
218 : }
219 :
220 : static DEVICE_ATTR(serial_number, S_IRUGO,
221 : amdgpu_device_get_serial_number, NULL);
222 :
223 : /**
224 : * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
225 : *
226 : * @dev: drm_device pointer
227 : *
228 : * Returns true if the device is a dGPU with ATPX power control,
229 : * otherwise returns false.
230 : */
231 0 : bool amdgpu_device_supports_px(struct drm_device *dev)
232 : {
233 0 : struct amdgpu_device *adev = drm_to_adev(dev);
234 :
235 0 : if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
236 : return true;
237 : return false;
238 : }
239 :
240 : /**
241 : * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
242 : *
243 : * @dev: drm_device pointer
244 : *
245 : * Returns true if the device is a dGPU with ACPI power control,
246 : * otherwise returns false.
247 : */
248 0 : bool amdgpu_device_supports_boco(struct drm_device *dev)
249 : {
250 0 : struct amdgpu_device *adev = drm_to_adev(dev);
251 :
252 0 : if (adev->has_pr3 ||
253 : ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
254 : return true;
255 0 : return false;
256 : }
257 :
258 : /**
259 : * amdgpu_device_supports_baco - Does the device support BACO
260 : *
261 : * @dev: drm_device pointer
262 : *
263 : * Returns true if the device supports BACO,
264 : * otherwise returns false.
265 : */
266 0 : bool amdgpu_device_supports_baco(struct drm_device *dev)
267 : {
268 0 : struct amdgpu_device *adev = drm_to_adev(dev);
269 :
270 0 : return amdgpu_asic_supports_baco(adev);
271 : }
272 :
273 : /**
274 : * amdgpu_device_supports_smart_shift - Is the device dGPU with
275 : * smart shift support
276 : *
277 : * @dev: drm_device pointer
278 : *
279 : * Returns true if the device is a dGPU with Smart Shift support,
280 : * otherwise returns false.
281 : */
282 0 : bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
283 : {
284 0 : return (amdgpu_device_supports_boco(dev) &&
285 : amdgpu_acpi_is_power_shift_control_supported());
286 : }
287 :
288 : /*
289 : * VRAM access helper functions
290 : */
291 :
292 : /**
293 : * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
294 : *
295 : * @adev: amdgpu_device pointer
296 : * @pos: offset of the buffer in vram
297 : * @buf: virtual address of the buffer in system memory
298 : * @size: read/write size; the buffer at @buf must hold at least @size bytes
299 : * @write: true - write to vram, otherwise - read from vram
300 : */
301 0 : void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
302 : void *buf, size_t size, bool write)
303 : {
304 : unsigned long flags;
305 0 : uint32_t hi = ~0, tmp = 0;
306 0 : uint32_t *data = buf;
307 : uint64_t last;
308 : int idx;
309 :
310 0 : if (!drm_dev_enter(adev_to_drm(adev), &idx))
311 0 : return;
312 :
313 0 : BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
314 :
315 0 : spin_lock_irqsave(&adev->mmio_idx_lock, flags);
316 0 : for (last = pos + size; pos < last; pos += 4) {
317 0 : tmp = pos >> 31;
318 :
319 0 : WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
320 0 : if (tmp != hi) {
321 0 : WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
322 0 : hi = tmp;
323 : }
324 0 : if (write)
325 0 : WREG32_NO_KIQ(mmMM_DATA, *data++);
326 : else
327 0 : *data++ = RREG32_NO_KIQ(mmMM_DATA);
328 : }
329 :
330 0 : spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
331 0 : drm_dev_exit(idx);
332 : }
333 :
334 : /**
335 : * amdgpu_device_aper_access - access vram by vram aperture
336 : *
337 : * @adev: amdgpu_device pointer
338 : * @pos: offset of the buffer in vram
339 : * @buf: virtual address of the buffer in system memory
340 : * @size: read/write size; the buffer at @buf must hold at least @size bytes
341 : * @write: true - write to vram, otherwise - read from vram
342 : *
343 : * Returns the number of bytes that have been transferred.
344 : */
345 0 : size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
346 : void *buf, size_t size, bool write)
347 : {
348 : #ifdef CONFIG_64BIT
349 : void __iomem *addr;
350 0 : size_t count = 0;
351 : uint64_t last;
352 :
353 0 : if (!adev->mman.aper_base_kaddr)
354 : return 0;
355 :
356 0 : last = min(pos + size, adev->gmc.visible_vram_size);
357 0 : if (last > pos) {
358 0 : addr = adev->mman.aper_base_kaddr + pos;
359 0 : count = last - pos;
360 :
361 0 : if (write) {
362 0 : memcpy_toio(addr, buf, count);
363 0 : mb();
364 0 : amdgpu_device_flush_hdp(adev, NULL);
365 : } else {
366 0 : amdgpu_device_invalidate_hdp(adev, NULL);
367 0 : mb();
368 0 : memcpy_fromio(buf, addr, count);
369 : }
370 :
371 : }
372 :
373 : return count;
374 : #else
375 : return 0;
376 : #endif
377 : }
378 :
379 : /**
380 : * amdgpu_device_vram_access - read/write a buffer in vram
381 : *
382 : * @adev: amdgpu_device pointer
383 : * @pos: offset of the buffer in vram
384 : * @buf: virtual address of the buffer in system memory
385 : * @size: read/write size; the buffer at @buf must hold at least @size bytes
386 : * @write: true - write to vram, otherwise - read from vram
387 : */
388 0 : void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
389 : void *buf, size_t size, bool write)
390 : {
391 : size_t count;
392 :
393 : /* try using the vram aperture to access vram first */
394 0 : count = amdgpu_device_aper_access(adev, pos, buf, size, write);
395 0 : size -= count;
396 0 : if (size) {
397 : /* use MM to access the rest of vram */
398 0 : pos += count;
399 0 : buf += count;
400 0 : amdgpu_device_mm_access(adev, pos, buf, size, write);
401 : }
402 0 : }
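/*
 * Illustrative call of the helper above (hypothetical caller, not taken from
 * this file): read 256 bytes from the start of VRAM into a local buffer.
 * Both @pos and @size must be dword aligned for the MM_INDEX/MM_DATA path.
 *
 *   u32 data[64];
 *
 *   amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */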
403 :
404 : /*
405 : * register access helper functions.
406 : */
407 :
408 : /* Check if hw access should be skipped because of hotplug or device error */
409 0 : bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
410 : {
411 0 : if (adev->no_hw_access)
412 : return true;
413 :
414 : #ifdef CONFIG_LOCKDEP
415 : /*
416 : * This is a bit complicated to understand, so worth a comment. What we assert
417 : * here is that the GPU reset is not running on another thread in parallel.
418 : *
419 : * For this we trylock the read side of the reset semaphore; if that succeeds,
420 : * we know that the reset is not running in parallel.
421 : *
422 : * If the trylock fails we assert that we are either already holding the read
423 : * side of the lock or are the reset thread itself and hold the write side of
424 : * the lock.
425 : */
426 : if (in_task()) {
427 : if (down_read_trylock(&adev->reset_domain->sem))
428 : up_read(&adev->reset_domain->sem);
429 : else
430 : lockdep_assert_held(&adev->reset_domain->sem);
431 : }
432 : #endif
433 0 : return false;
434 : }
435 :
436 : /**
437 : * amdgpu_device_rreg - read a memory mapped IO or indirect register
438 : *
439 : * @adev: amdgpu_device pointer
440 : * @reg: dword aligned register offset
441 : * @acc_flags: access flags which require special behavior
442 : *
443 : * Returns the 32 bit value from the offset specified.
444 : */
445 0 : uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
446 : uint32_t reg, uint32_t acc_flags)
447 : {
448 : uint32_t ret;
449 :
450 0 : if (amdgpu_device_skip_hw_access(adev))
451 : return 0;
452 :
453 0 : if ((reg * 4) < adev->rmmio_size) {
454 0 : if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
455 0 : amdgpu_sriov_runtime(adev) &&
456 0 : down_read_trylock(&adev->reset_domain->sem)) {
457 0 : ret = amdgpu_kiq_rreg(adev, reg);
458 0 : up_read(&adev->reset_domain->sem);
459 : } else {
460 0 : ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
461 : }
462 : } else {
463 0 : ret = adev->pcie_rreg(adev, reg * 4);
464 : }
465 :
466 0 : trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
467 :
468 0 : return ret;
469 : }
470 :
471 : /*
472 : * Byte-wide MMIO register read helper
473 : * @offset: byte offset from MMIO start
474 : *
475 : */
476 :
477 : /**
478 : * amdgpu_mm_rreg8 - read a memory mapped IO register
479 : *
480 : * @adev: amdgpu_device pointer
481 : * @offset: byte aligned register offset
482 : *
483 : * Returns the 8 bit value from the offset specified.
484 : */
485 0 : uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
486 : {
487 0 : if (amdgpu_device_skip_hw_access(adev))
488 : return 0;
489 :
490 0 : if (offset < adev->rmmio_size)
491 0 : return (readb(adev->rmmio + offset));
492 0 : BUG();
493 : }
494 :
495 : /*
496 : * Byte-wide MMIO register write helper
497 : * @offset: byte offset from MMIO start
498 : * @value: the value to be written to the register
499 : *
500 : */
501 : /**
502 : * amdgpu_mm_wreg8 - write a memory mapped IO register
503 : *
504 : * @adev: amdgpu_device pointer
505 : * @offset: byte aligned register offset
506 : * @value: 8 bit value to write
507 : *
508 : * Writes the value specified to the offset specified.
509 : */
510 0 : void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
511 : {
512 0 : if (amdgpu_device_skip_hw_access(adev))
513 : return;
514 :
515 0 : if (offset < adev->rmmio_size)
516 0 : writeb(value, adev->rmmio + offset);
517 : else
518 0 : BUG();
519 : }
520 :
521 : /**
522 : * amdgpu_device_wreg - write to a memory mapped IO or indirect register
523 : *
524 : * @adev: amdgpu_device pointer
525 : * @reg: dword aligned register offset
526 : * @v: 32 bit value to write to the register
527 : * @acc_flags: access flags which require special behavior
528 : *
529 : * Writes the value specified to the offset specified.
530 : */
531 0 : void amdgpu_device_wreg(struct amdgpu_device *adev,
532 : uint32_t reg, uint32_t v,
533 : uint32_t acc_flags)
534 : {
535 0 : if (amdgpu_device_skip_hw_access(adev))
536 : return;
537 :
538 0 : if ((reg * 4) < adev->rmmio_size) {
539 0 : if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
540 0 : amdgpu_sriov_runtime(adev) &&
541 0 : down_read_trylock(&adev->reset_domain->sem)) {
542 0 : amdgpu_kiq_wreg(adev, reg, v);
543 0 : up_read(&adev->reset_domain->sem);
544 : } else {
545 0 : writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
546 : }
547 : } else {
548 0 : adev->pcie_wreg(adev, reg * 4, v);
549 : }
550 :
551 0 : trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
552 : }
553 :
554 : /**
555 : * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
556 : *
557 : * @adev: amdgpu_device pointer
558 : * @reg: mmio/rlc register
559 : * @v: value to write
560 : *
561 : * this function is invoked only for the debugfs register access
562 : */
563 0 : void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
564 : uint32_t reg, uint32_t v)
565 : {
566 0 : if (amdgpu_device_skip_hw_access(adev))
567 : return;
568 :
569 0 : if (amdgpu_sriov_fullaccess(adev) &&
570 0 : adev->gfx.rlc.funcs &&
571 0 : adev->gfx.rlc.funcs->is_rlcg_access_range) {
572 0 : if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
573 0 : return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
574 0 : } else if ((reg * 4) >= adev->rmmio_size) {
575 0 : adev->pcie_wreg(adev, reg * 4, v);
576 : } else {
577 0 : writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
578 : }
579 : }
580 :
581 : /**
582 : * amdgpu_mm_rdoorbell - read a doorbell dword
583 : *
584 : * @adev: amdgpu_device pointer
585 : * @index: doorbell index
586 : *
587 : * Returns the value in the doorbell aperture at the
588 : * requested doorbell index (CIK).
589 : */
590 0 : u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
591 : {
592 0 : if (amdgpu_device_skip_hw_access(adev))
593 : return 0;
594 :
595 0 : if (index < adev->doorbell.num_doorbells) {
596 0 : return readl(adev->doorbell.ptr + index);
597 : } else {
598 0 : DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
599 0 : return 0;
600 : }
601 : }
602 :
603 : /**
604 : * amdgpu_mm_wdoorbell - write a doorbell dword
605 : *
606 : * @adev: amdgpu_device pointer
607 : * @index: doorbell index
608 : * @v: value to write
609 : *
610 : * Writes @v to the doorbell aperture at the
611 : * requested doorbell index (CIK).
612 : */
613 0 : void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
614 : {
615 0 : if (amdgpu_device_skip_hw_access(adev))
616 : return;
617 :
618 0 : if (index < adev->doorbell.num_doorbells) {
619 0 : writel(v, adev->doorbell.ptr + index);
620 : } else {
621 0 : DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
622 : }
623 : }
624 :
625 : /**
626 : * amdgpu_mm_rdoorbell64 - read a doorbell Qword
627 : *
628 : * @adev: amdgpu_device pointer
629 : * @index: doorbell index
630 : *
631 : * Returns the value in the doorbell aperture at the
632 : * requested doorbell index (VEGA10+).
633 : */
634 0 : u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
635 : {
636 0 : if (amdgpu_device_skip_hw_access(adev))
637 : return 0;
638 :
639 0 : if (index < adev->doorbell.num_doorbells) {
640 0 : return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
641 : } else {
642 0 : DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
643 0 : return 0;
644 : }
645 : }
646 :
647 : /**
648 : * amdgpu_mm_wdoorbell64 - write a doorbell Qword
649 : *
650 : * @adev: amdgpu_device pointer
651 : * @index: doorbell index
652 : * @v: value to write
653 : *
654 : * Writes @v to the doorbell aperture at the
655 : * requested doorbell index (VEGA10+).
656 : */
657 0 : void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
658 : {
659 0 : if (amdgpu_device_skip_hw_access(adev))
660 : return;
661 :
662 0 : if (index < adev->doorbell.num_doorbells) {
663 0 : atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
664 : } else {
665 0 : DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
666 : }
667 : }
668 :
669 : /**
670 : * amdgpu_device_indirect_rreg - read an indirect register
671 : *
672 : * @adev: amdgpu_device pointer
673 : * @pcie_index: mmio register offset
674 : * @pcie_data: mmio register offset
675 : * @reg_addr: indirect register address to read from
676 : *
677 : * Returns the value of indirect register @reg_addr
678 : */
679 0 : u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
680 : u32 pcie_index, u32 pcie_data,
681 : u32 reg_addr)
682 : {
683 : unsigned long flags;
684 : u32 r;
685 : void __iomem *pcie_index_offset;
686 : void __iomem *pcie_data_offset;
687 :
688 0 : spin_lock_irqsave(&adev->pcie_idx_lock, flags);
689 0 : pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
690 0 : pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
691 :
692 0 : writel(reg_addr, pcie_index_offset);
693 0 : readl(pcie_index_offset);
694 0 : r = readl(pcie_data_offset);
695 0 : spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
696 :
697 0 : return r;
698 : }
699 :
700 : /**
701 : * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
702 : *
703 : * @adev: amdgpu_device pointer
704 : * @pcie_index: mmio register offset
705 : * @pcie_data: mmio register offset
706 : * @reg_addr: indirect register address to read from
707 : *
708 : * Returns the value of indirect register @reg_addr
709 : */
710 0 : u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
711 : u32 pcie_index, u32 pcie_data,
712 : u32 reg_addr)
713 : {
714 : unsigned long flags;
715 : u64 r;
716 : void __iomem *pcie_index_offset;
717 : void __iomem *pcie_data_offset;
718 :
719 0 : spin_lock_irqsave(&adev->pcie_idx_lock, flags);
720 0 : pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
721 0 : pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
722 :
723 : /* read low 32 bits */
724 0 : writel(reg_addr, pcie_index_offset);
725 0 : readl(pcie_index_offset);
726 0 : r = readl(pcie_data_offset);
727 : /* read high 32 bits */
728 0 : writel(reg_addr + 4, pcie_index_offset);
729 0 : readl(pcie_index_offset);
730 0 : r |= ((u64)readl(pcie_data_offset) << 32);
731 0 : spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
732 :
733 0 : return r;
734 : }
735 :
736 : /**
737 : * amdgpu_device_indirect_wreg - write an indirect register address
738 : *
739 : * @adev: amdgpu_device pointer
740 : * @pcie_index: mmio register offset
741 : * @pcie_data: mmio register offset
742 : * @reg_addr: indirect register offset
743 : * @reg_data: indirect register data
744 : *
745 : */
746 0 : void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
747 : u32 pcie_index, u32 pcie_data,
748 : u32 reg_addr, u32 reg_data)
749 : {
750 : unsigned long flags;
751 : void __iomem *pcie_index_offset;
752 : void __iomem *pcie_data_offset;
753 :
754 0 : spin_lock_irqsave(&adev->pcie_idx_lock, flags);
755 0 : pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
756 0 : pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
757 :
758 0 : writel(reg_addr, pcie_index_offset);
759 0 : readl(pcie_index_offset);
760 0 : writel(reg_data, pcie_data_offset);
761 0 : readl(pcie_data_offset);
762 0 : spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
763 0 : }
764 :
765 : /**
766 : * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
767 : *
768 : * @adev: amdgpu_device pointer
769 : * @pcie_index: mmio register offset
770 : * @pcie_data: mmio register offset
771 : * @reg_addr: indirect register offset
772 : * @reg_data: indirect register data
773 : *
774 : */
775 0 : void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
776 : u32 pcie_index, u32 pcie_data,
777 : u32 reg_addr, u64 reg_data)
778 : {
779 : unsigned long flags;
780 : void __iomem *pcie_index_offset;
781 : void __iomem *pcie_data_offset;
782 :
783 0 : spin_lock_irqsave(&adev->pcie_idx_lock, flags);
784 0 : pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
785 0 : pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
786 :
787 : /* write low 32 bits */
788 0 : writel(reg_addr, pcie_index_offset);
789 0 : readl(pcie_index_offset);
790 0 : writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
791 0 : readl(pcie_data_offset);
792 : /* write high 32 bits */
793 0 : writel(reg_addr + 4, pcie_index_offset);
794 0 : readl(pcie_index_offset);
795 0 : writel((u32)(reg_data >> 32), pcie_data_offset);
796 0 : readl(pcie_data_offset);
797 0 : spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
798 0 : }
799 :
800 : /**
801 : * amdgpu_invalid_rreg - dummy reg read function
802 : *
803 : * @adev: amdgpu_device pointer
804 : * @reg: offset of register
805 : *
806 : * Dummy register read function. Used for register blocks
807 : * that certain asics don't have (all asics).
808 : * Returns the value in the register.
809 : */
810 0 : static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
811 : {
812 0 : DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
813 0 : BUG();
814 : return 0;
815 : }
816 :
817 : /**
818 : * amdgpu_invalid_wreg - dummy reg write function
819 : *
820 : * @adev: amdgpu_device pointer
821 : * @reg: offset of register
822 : * @v: value to write to the register
823 : *
824 : * Dummy register write function. Used for register blocks
825 : * that certain asics don't have (all asics).
826 : */
827 0 : static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
828 : {
829 0 : DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
830 : reg, v);
831 0 : BUG();
832 : }
833 :
834 : /**
835 : * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
836 : *
837 : * @adev: amdgpu_device pointer
838 : * @reg: offset of register
839 : *
840 : * Dummy register read function. Used for register blocks
841 : * that certain asics don't have (all asics).
842 : * Returns the value in the register.
843 : */
844 0 : static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
845 : {
846 0 : DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
847 0 : BUG();
848 : return 0;
849 : }
850 :
851 : /**
852 : * amdgpu_invalid_wreg64 - dummy reg write function
853 : *
854 : * @adev: amdgpu_device pointer
855 : * @reg: offset of register
856 : * @v: value to write to the register
857 : *
858 : * Dummy register write function. Used for register blocks
859 : * that certain asics don't have (all asics).
860 : */
861 0 : static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
862 : {
863 0 : DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
864 : reg, v);
865 0 : BUG();
866 : }
867 :
868 : /**
869 : * amdgpu_block_invalid_rreg - dummy reg read function
870 : *
871 : * @adev: amdgpu_device pointer
872 : * @block: offset of instance
873 : * @reg: offset of register
874 : *
875 : * Dummy register read function. Used for register blocks
876 : * that certain asics don't have (all asics).
877 : * Returns the value in the register.
878 : */
879 0 : static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
880 : uint32_t block, uint32_t reg)
881 : {
882 0 : DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
883 : reg, block);
884 0 : BUG();
885 : return 0;
886 : }
887 :
888 : /**
889 : * amdgpu_block_invalid_wreg - dummy reg write function
890 : *
891 : * @adev: amdgpu_device pointer
892 : * @block: offset of instance
893 : * @reg: offset of register
894 : * @v: value to write to the register
895 : *
896 : * Dummy register write function. Used for register blocks
897 : * that certain asics don't have (all asics).
898 : */
899 0 : static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
900 : uint32_t block,
901 : uint32_t reg, uint32_t v)
902 : {
903 0 : DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
904 : reg, block, v);
905 0 : BUG();
906 : }
907 :
908 : /**
909 : * amdgpu_device_asic_init - Wrapper for atom asic_init
910 : *
911 : * @adev: amdgpu_device pointer
912 : *
913 : * Does any asic specific work and then calls atom asic init.
914 : */
915 0 : static int amdgpu_device_asic_init(struct amdgpu_device *adev)
916 : {
917 0 : amdgpu_asic_pre_asic_init(adev);
918 :
919 0 : if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
920 0 : return amdgpu_atomfirmware_asic_init(adev, true);
921 : else
922 0 : return amdgpu_atom_asic_init(adev->mode_info.atom_context);
923 : }
924 :
925 : /**
926 : * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
927 : *
928 : * @adev: amdgpu_device pointer
929 : *
930 : * Allocates a scratch page of VRAM for use by various things in the
931 : * driver.
932 : */
933 : static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
934 : {
935 0 : return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
936 : PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
937 : &adev->vram_scratch.robj,
938 : &adev->vram_scratch.gpu_addr,
939 0 : (void **)&adev->vram_scratch.ptr);
940 : }
941 :
942 : /**
943 : * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
944 : *
945 : * @adev: amdgpu_device pointer
946 : *
947 : * Frees the VRAM scratch page.
948 : */
949 : static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
950 : {
951 0 : amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
952 : }
953 :
954 : /**
955 : * amdgpu_device_program_register_sequence - program an array of registers.
956 : *
957 : * @adev: amdgpu_device pointer
958 : * @registers: pointer to the register array
959 : * @array_size: size of the register array
960 : *
961 : * Programs an array of registers with AND and OR masks.
962 : * This is a helper for setting golden registers.
963 : */
964 0 : void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
965 : const u32 *registers,
966 : const u32 array_size)
967 : {
968 : u32 tmp, reg, and_mask, or_mask;
969 : int i;
970 :
971 0 : if (array_size % 3)
972 : return;
973 :
974 0 : for (i = 0; i < array_size; i +=3) {
975 0 : reg = registers[i + 0];
976 0 : and_mask = registers[i + 1];
977 0 : or_mask = registers[i + 2];
978 :
979 0 : if (and_mask == 0xffffffff) {
980 : tmp = or_mask;
981 : } else {
982 0 : tmp = RREG32(reg);
983 0 : tmp &= ~and_mask;
984 0 : if (adev->family >= AMDGPU_FAMILY_AI)
985 0 : tmp |= (or_mask & and_mask);
986 : else
987 0 : tmp |= or_mask;
988 : }
989 0 : WREG32(reg, tmp);
990 : }
991 : }
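/*
 * Illustrative layout of the @registers array consumed above (the offsets and
 * masks below are made-up values, not real golden settings): the array is a
 * flat list of { register, AND mask, OR mask } triplets.
 *
 *   static const u32 example_golden_settings[] = {
 *           0x263e, 0xffffffff, 0x02010002,
 *           0x2684, 0x00010000, 0x00018208,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */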
992 :
993 : /**
994 : * amdgpu_device_pci_config_reset - reset the GPU
995 : *
996 : * @adev: amdgpu_device pointer
997 : *
998 : * Resets the GPU using the pci config reset sequence.
999 : * Only applicable to asics prior to vega10.
1000 : */
1001 0 : void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1002 : {
1003 0 : pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1004 0 : }
1005 :
1006 : /**
1007 : * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1008 : *
1009 : * @adev: amdgpu_device pointer
1010 : *
1011 : * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1012 : */
1013 0 : int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1014 : {
1015 0 : return pci_reset_function(adev->pdev);
1016 : }
1017 :
1018 : /*
1019 : * GPU doorbell aperture helpers function.
1020 : */
1021 : /**
1022 : * amdgpu_device_doorbell_init - Init doorbell driver information.
1023 : *
1024 : * @adev: amdgpu_device pointer
1025 : *
1026 : * Init doorbell driver information (CIK)
1027 : * Returns 0 on success, error on failure.
1028 : */
1029 0 : static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1030 : {
1031 :
1032 : /* No doorbell on SI hardware generation */
1033 0 : if (adev->asic_type < CHIP_BONAIRE) {
1034 0 : adev->doorbell.base = 0;
1035 0 : adev->doorbell.size = 0;
1036 0 : adev->doorbell.num_doorbells = 0;
1037 0 : adev->doorbell.ptr = NULL;
1038 0 : return 0;
1039 : }
1040 :
1041 0 : if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1042 : return -EINVAL;
1043 :
1044 0 : amdgpu_asic_init_doorbell_index(adev);
1045 :
1046 : /* doorbell bar mapping */
1047 0 : adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1048 0 : adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1049 :
1050 0 : if (adev->enable_mes) {
1051 0 : adev->doorbell.num_doorbells =
1052 0 : adev->doorbell.size / sizeof(u32);
1053 : } else {
1054 0 : adev->doorbell.num_doorbells =
1055 0 : min_t(u32, adev->doorbell.size / sizeof(u32),
1056 : adev->doorbell_index.max_assignment+1);
1057 0 : if (adev->doorbell.num_doorbells == 0)
1058 : return -EINVAL;
1059 :
1060 : /* For Vega, reserve and map two pages on doorbell BAR since SDMA
1061 : * paging queue doorbell uses the second page. The
1062 : * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1063 : * doorbells are in the first page. So with paging queue enabled,
1064 : * the max num_doorbells should be increased by 1 page (0x400 in dwords)
1065 : */
1066 0 : if (adev->asic_type >= CHIP_VEGA10)
1067 0 : adev->doorbell.num_doorbells += 0x400;
1068 : }
1069 :
1070 0 : adev->doorbell.ptr = ioremap(adev->doorbell.base,
1071 0 : adev->doorbell.num_doorbells *
1072 : sizeof(u32));
1073 0 : if (adev->doorbell.ptr == NULL)
1074 : return -ENOMEM;
1075 :
1076 0 : return 0;
1077 : }
1078 :
1079 : /**
1080 : * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1081 : *
1082 : * @adev: amdgpu_device pointer
1083 : *
1084 : * Tear down doorbell driver information (CIK)
1085 : */
1086 : static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1087 : {
1088 0 : iounmap(adev->doorbell.ptr);
1089 0 : adev->doorbell.ptr = NULL;
1090 : }
1091 :
1092 :
1093 :
1094 : /*
1095 : * amdgpu_device_wb_*()
1096 : * Writeback is the method by which the GPU updates special pages in memory
1097 : * with the status of certain GPU events (fences, ring pointers, etc.).
1098 : */
1099 :
1100 : /**
1101 : * amdgpu_device_wb_fini - Disable Writeback and free memory
1102 : *
1103 : * @adev: amdgpu_device pointer
1104 : *
1105 : * Disables Writeback and frees the Writeback memory (all asics).
1106 : * Used at driver shutdown.
1107 : */
1108 : static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1109 : {
1110 0 : if (adev->wb.wb_obj) {
1111 0 : amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1112 0 : &adev->wb.gpu_addr,
1113 0 : (void **)&adev->wb.wb);
1114 0 : adev->wb.wb_obj = NULL;
1115 : }
1116 : }
1117 :
1118 : /**
1119 : * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1120 : *
1121 : * @adev: amdgpu_device pointer
1122 : *
1123 : * Initializes writeback and allocates writeback memory (all asics).
1124 : * Used at driver startup.
1125 : * Returns 0 on success or a negative error code on failure.
1126 : */
1127 0 : static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1128 : {
1129 : int r;
1130 :
1131 0 : if (adev->wb.wb_obj == NULL) {
1132 : /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1133 0 : r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1134 : PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1135 0 : &adev->wb.wb_obj, &adev->wb.gpu_addr,
1136 0 : (void **)&adev->wb.wb);
1137 0 : if (r) {
1138 0 : dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1139 0 : return r;
1140 : }
1141 :
1142 0 : adev->wb.num_wb = AMDGPU_MAX_WB;
1143 0 : memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1144 :
1145 : /* clear wb memory */
1146 0 : memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1147 : }
1148 :
1149 : return 0;
1150 : }
1151 :
1152 : /**
1153 : * amdgpu_device_wb_get - Allocate a wb entry
1154 : *
1155 : * @adev: amdgpu_device pointer
1156 : * @wb: wb index
1157 : *
1158 : * Allocate a wb slot for use by the driver (all asics).
1159 : * Returns 0 on success or -EINVAL on failure.
1160 : */
1161 0 : int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1162 : {
1163 0 : unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1164 :
1165 0 : if (offset < adev->wb.num_wb) {
1166 0 : __set_bit(offset, adev->wb.used);
1167 0 : *wb = offset << 3; /* convert to dw offset */
1168 0 : return 0;
1169 : } else {
1170 : return -EINVAL;
1171 : }
1172 : }
1173 :
1174 : /**
1175 : * amdgpu_device_wb_free - Free a wb entry
1176 : *
1177 : * @adev: amdgpu_device pointer
1178 : * @wb: wb index
1179 : *
1180 : * Free a wb slot allocated for use by the driver (all asics)
1181 : */
1182 0 : void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1183 : {
1184 0 : wb >>= 3;
1185 0 : if (wb < adev->wb.num_wb)
1186 0 : __clear_bit(wb, adev->wb.used);
1187 0 : }
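/*
 * Illustrative use of the writeback helpers above (hypothetical caller):
 * allocate a slot, derive its CPU and GPU addresses, and release it again.
 * The returned index is a dword offset into the writeback page.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *           ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */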
1188 :
1189 : /**
1190 : * amdgpu_device_resize_fb_bar - try to resize FB BAR
1191 : *
1192 : * @adev: amdgpu_device pointer
1193 : *
1194 : * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1195 : * to fail, but if any of the BARs is not accessible after the resize we abort
1196 : * driver loading by returning -ENODEV.
1197 : */
1198 0 : int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1199 : {
1200 0 : int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1201 : struct pci_bus *root;
1202 : struct resource *res;
1203 : unsigned i;
1204 : u16 cmd;
1205 : int r;
1206 :
1207 : /* Bypass for VF */
1208 0 : if (amdgpu_sriov_vf(adev))
1209 : return 0;
1210 :
1211 : /* skip if the bios has already enabled large BAR */
1212 0 : if (adev->gmc.real_vram_size &&
1213 0 : (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1214 : return 0;
1215 :
1216 : /* Check if the root BUS has 64bit memory resources */
1217 0 : root = adev->pdev->bus;
1218 0 : while (root->parent)
1219 : root = root->parent;
1220 :
1221 0 : pci_bus_for_each_resource(root, res, i) {
1222 0 : if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1223 0 : res->start > 0x100000000ull)
1224 : break;
1225 : }
1226 :
1227 : /* Trying to resize is pointless without a root hub window above 4GB */
1228 0 : if (!res)
1229 : return 0;
1230 :
1231 : /* Limit the BAR size to what is available */
1232 0 : rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1233 : rbar_size);
1234 :
1235 : /* Disable memory decoding while we change the BAR addresses and size */
1236 0 : pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1237 0 : pci_write_config_word(adev->pdev, PCI_COMMAND,
1238 0 : cmd & ~PCI_COMMAND_MEMORY);
1239 :
1240 : /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1241 0 : amdgpu_device_doorbell_fini(adev);
1242 0 : if (adev->asic_type >= CHIP_BONAIRE)
1243 0 : pci_release_resource(adev->pdev, 2);
1244 :
1245 0 : pci_release_resource(adev->pdev, 0);
1246 :
1247 0 : r = pci_resize_resource(adev->pdev, 0, rbar_size);
1248 0 : if (r == -ENOSPC)
1249 0 : DRM_INFO("Not enough PCI address space for a large BAR.");
1250 0 : else if (r && r != -ENOTSUPP)
1251 0 : DRM_ERROR("Problem resizing BAR0 (%d).", r);
1252 :
1253 0 : pci_assign_unassigned_bus_resources(adev->pdev->bus);
1254 :
1255 : /* When the doorbell or fb BAR isn't available we have no chance of
1256 : * using the device.
1257 : */
1258 0 : r = amdgpu_device_doorbell_init(adev);
1259 0 : if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1260 : return -ENODEV;
1261 :
1262 0 : pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1263 :
1264 0 : return 0;
1265 : }
1266 :
1267 : /*
1268 : * GPU helpers function.
1269 : */
1270 : /**
1271 : * amdgpu_device_need_post - check if the hw needs post or not
1272 : *
1273 : * @adev: amdgpu_device pointer
1274 : *
1275 : * Check if the asic has been initialized (all asics) at driver startup,
1276 : * or whether a post is needed if a hw reset was performed.
1277 : * Returns true if post is needed or false if not.
1278 : */
1279 0 : bool amdgpu_device_need_post(struct amdgpu_device *adev)
1280 : {
1281 : uint32_t reg;
1282 :
1283 0 : if (amdgpu_sriov_vf(adev))
1284 : return false;
1285 :
1286 0 : if (amdgpu_passthrough(adev)) {
1287 : /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM reboot
1288 : * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU
1289 : * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force a
1290 : * vPost for SMC versions below 22.15
1291 : */
1292 0 : if (adev->asic_type == CHIP_FIJI) {
1293 : int err;
1294 : uint32_t fw_ver;
1295 0 : err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1296 : /* force vPost if an error occurred */
1297 0 : if (err)
1298 : return true;
1299 :
1300 0 : fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1301 0 : if (fw_ver < 0x00160e00)
1302 : return true;
1303 : }
1304 : }
1305 :
1306 : /* Don't post if we need to reset whole hive on init */
1307 0 : if (adev->gmc.xgmi.pending_reset)
1308 : return false;
1309 :
1310 0 : if (adev->has_hw_reset) {
1311 0 : adev->has_hw_reset = false;
1312 0 : return true;
1313 : }
1314 :
1315 : /* bios scratch used on CIK+ */
1316 0 : if (adev->asic_type >= CHIP_BONAIRE)
1317 0 : return amdgpu_atombios_scratch_need_asic_init(adev);
1318 :
1319 : /* check MEM_SIZE for older asics */
1320 0 : reg = amdgpu_asic_get_config_memsize(adev);
1321 :
1322 0 : if ((reg != 0) && (reg != 0xffffffff))
1323 : return false;
1324 :
1325 0 : return true;
1326 : }
1327 :
1328 : /**
1329 : * amdgpu_device_should_use_aspm - check if the device should program ASPM
1330 : *
1331 : * @adev: amdgpu_device pointer
1332 : *
1333 : * Confirm whether the module parameter and pcie bridge agree that ASPM should
1334 : * be set for this device.
1335 : *
1336 : * Returns true if it should be used or false if not.
1337 : */
1338 0 : bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1339 : {
1340 0 : switch (amdgpu_aspm) {
1341 : case -1:
1342 : break;
1343 : case 0:
1344 : return false;
1345 : case 1:
1346 0 : return true;
1347 : default:
1348 : return false;
1349 : }
1350 0 : return pcie_aspm_enabled(adev->pdev);
1351 : }
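/*
 * Descriptive note on the switch above: amdgpu_aspm = 0 forces ASPM off,
 * amdgpu_aspm = 1 forces it on, and the default of -1 defers to whatever the
 * PCIe subsystem already enabled for the device (pcie_aspm_enabled()).
 */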
1352 :
1353 : /* if we get transitioned to only one device, take VGA back */
1354 : /**
1355 : * amdgpu_device_vga_set_decode - enable/disable vga decode
1356 : *
1357 : * @pdev: PCI device pointer
1358 : * @state: enable/disable vga decode
1359 : *
1360 : * Enable/disable vga decode (all asics).
1361 : * Returns VGA resource flags.
1362 : */
1363 0 : static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1364 : bool state)
1365 : {
1366 0 : struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1367 0 : amdgpu_asic_set_vga_state(adev, state);
1368 0 : if (state)
1369 : return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1370 : VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1371 : else
1372 0 : return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1373 : }
1374 :
1375 : /**
1376 : * amdgpu_device_check_block_size - validate the vm block size
1377 : *
1378 : * @adev: amdgpu_device pointer
1379 : *
1380 : * Validates the vm block size specified via module parameter.
1381 : * The vm block size defines number of bits in page table versus page directory,
1382 : * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1383 : * page table and the remaining bits are in the page directory.
1384 : */
1385 0 : static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1386 : {
1387 : /* defines number of bits in page table versus page directory,
1388 : * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1389 : * page table and the remaining bits are in the page directory */
1390 0 : if (amdgpu_vm_block_size == -1)
1391 : return;
1392 :
1393 0 : if (amdgpu_vm_block_size < 9) {
1394 0 : dev_warn(adev->dev, "VM page table size (%d) too small\n",
1395 : amdgpu_vm_block_size);
1396 0 : amdgpu_vm_block_size = -1;
1397 : }
1398 : }
1399 :
1400 : /**
1401 : * amdgpu_device_check_vm_size - validate the vm size
1402 : *
1403 : * @adev: amdgpu_device pointer
1404 : *
1405 : * Validates the vm size in GB specified via module parameter.
1406 : * The VM size is the size of the GPU virtual memory space in GB.
1407 : */
1408 0 : static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1409 : {
1410 : /* no need to check the default value */
1411 0 : if (amdgpu_vm_size == -1)
1412 : return;
1413 :
1414 0 : if (amdgpu_vm_size < 1) {
1415 0 : dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1416 : amdgpu_vm_size);
1417 0 : amdgpu_vm_size = -1;
1418 : }
1419 : }
1420 :
1421 0 : static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1422 : {
1423 : struct sysinfo si;
1424 0 : bool is_os_64 = (sizeof(void *) == 8);
1425 : uint64_t total_memory;
1426 0 : uint64_t dram_size_seven_GB = 0x1B8000000;
1427 0 : uint64_t dram_size_three_GB = 0xB8000000;
1428 :
1429 0 : if (amdgpu_smu_memory_pool_size == 0)
1430 0 : return;
1431 :
1432 : if (!is_os_64) {
1433 : DRM_WARN("Not 64-bit OS, feature not supported\n");
1434 : goto def_value;
1435 : }
1436 0 : si_meminfo(&si);
1437 0 : total_memory = (uint64_t)si.totalram * si.mem_unit;
1438 :
1439 0 : if ((amdgpu_smu_memory_pool_size == 1) ||
1440 : (amdgpu_smu_memory_pool_size == 2)) {
1441 0 : if (total_memory < dram_size_three_GB)
1442 : goto def_value1;
1443 0 : } else if ((amdgpu_smu_memory_pool_size == 4) ||
1444 : (amdgpu_smu_memory_pool_size == 8)) {
1445 0 : if (total_memory < dram_size_seven_GB)
1446 : goto def_value1;
1447 : } else {
1448 0 : DRM_WARN("Smu memory pool size not supported\n");
1449 : goto def_value;
1450 : }
1451 0 : adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1452 :
1453 : return;
1454 :
1455 : def_value1:
1456 0 : DRM_WARN("Not enough system memory\n");
1457 : def_value:
1458 0 : adev->pm.smu_prv_buffer_size = 0;
1459 : }
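/*
 * Descriptive note on the check above: amdgpu_smu_memory_pool_size is
 * expressed in units of 256MB (the value is shifted left by 28 bits to get
 * bytes), so the accepted values 1/2/4/8 map to 256MB/512MB/1GB/2GB pools.
 * Pools of 1 or 2 units require roughly 3GB of system RAM, pools of 4 or 8
 * units roughly 7GB; anything else falls back to no pool at all.
 */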
1460 :
1461 0 : static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1462 : {
1463 0 : if (!(adev->flags & AMD_IS_APU) ||
1464 0 : adev->asic_type < CHIP_RAVEN)
1465 : return 0;
1466 :
1467 0 : switch (adev->asic_type) {
1468 : case CHIP_RAVEN:
1469 0 : if (adev->pdev->device == 0x15dd)
1470 0 : adev->apu_flags |= AMD_APU_IS_RAVEN;
1471 0 : if (adev->pdev->device == 0x15d8)
1472 0 : adev->apu_flags |= AMD_APU_IS_PICASSO;
1473 : break;
1474 : case CHIP_RENOIR:
1475 0 : if ((adev->pdev->device == 0x1636) ||
1476 : (adev->pdev->device == 0x164c))
1477 0 : adev->apu_flags |= AMD_APU_IS_RENOIR;
1478 : else
1479 0 : adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1480 : break;
1481 : case CHIP_VANGOGH:
1482 0 : adev->apu_flags |= AMD_APU_IS_VANGOGH;
1483 0 : break;
1484 : case CHIP_YELLOW_CARP:
1485 : break;
1486 : case CHIP_CYAN_SKILLFISH:
1487 0 : if ((adev->pdev->device == 0x13FE) ||
1488 : (adev->pdev->device == 0x143F))
1489 0 : adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1490 : break;
1491 : default:
1492 : break;
1493 : }
1494 :
1495 : return 0;
1496 : }
1497 :
1498 : /**
1499 : * amdgpu_device_check_arguments - validate module params
1500 : *
1501 : * @adev: amdgpu_device pointer
1502 : *
1503 : * Validates certain module parameters and updates
1504 : * the associated values used by the driver (all asics).
1505 : */
1506 0 : static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1507 : {
1508 0 : if (amdgpu_sched_jobs < 4) {
1509 0 : dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1510 : amdgpu_sched_jobs);
1511 0 : amdgpu_sched_jobs = 4;
1512 0 : } else if (!is_power_of_2(amdgpu_sched_jobs)){
1513 0 : dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1514 : amdgpu_sched_jobs);
1515 0 : amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1516 : }
1517 :
1518 0 : if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1519 : /* gart size must be greater or equal to 32M */
1520 0 : dev_warn(adev->dev, "gart size (%d) too small\n",
1521 : amdgpu_gart_size);
1522 0 : amdgpu_gart_size = -1;
1523 : }
1524 :
1525 0 : if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1526 : /* gtt size must be greater or equal to 32M */
1527 0 : dev_warn(adev->dev, "gtt size (%d) too small\n",
1528 : amdgpu_gtt_size);
1529 0 : amdgpu_gtt_size = -1;
1530 : }
1531 :
1532 : /* valid range is between 4 and 9 inclusive */
1533 0 : if (amdgpu_vm_fragment_size != -1 &&
1534 : (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1535 0 : dev_warn(adev->dev, "valid range is between 4 and 9\n");
1536 0 : amdgpu_vm_fragment_size = -1;
1537 : }
1538 :
1539 0 : if (amdgpu_sched_hw_submission < 2) {
1540 0 : dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1541 : amdgpu_sched_hw_submission);
1542 0 : amdgpu_sched_hw_submission = 2;
1543 0 : } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1544 0 : dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1545 : amdgpu_sched_hw_submission);
1546 0 : amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1547 : }
1548 :
1549 0 : if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1550 0 : dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1551 0 : amdgpu_reset_method = -1;
1552 : }
1553 :
1554 0 : amdgpu_device_check_smu_prv_buffer_size(adev);
1555 :
1556 0 : amdgpu_device_check_vm_size(adev);
1557 :
1558 0 : amdgpu_device_check_block_size(adev);
1559 :
1560 0 : adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1561 :
1562 0 : return 0;
1563 : }
1564 :
1565 : /**
1566 : * amdgpu_switcheroo_set_state - set switcheroo state
1567 : *
1568 : * @pdev: pci dev pointer
1569 : * @state: vga_switcheroo state
1570 : *
1571 : * Callback for the switcheroo driver. Suspends or resumes
1572 : * the asic before or after it is powered up using ACPI methods.
1573 : */
1574 : static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1575 : enum vga_switcheroo_state state)
1576 : {
1577 : struct drm_device *dev = pci_get_drvdata(pdev);
1578 : int r;
1579 :
1580 : if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1581 : return;
1582 :
1583 : if (state == VGA_SWITCHEROO_ON) {
1584 : pr_info("switched on\n");
1585 : /* don't suspend or resume card normally */
1586 : dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1587 :
1588 : pci_set_power_state(pdev, PCI_D0);
1589 : amdgpu_device_load_pci_state(pdev);
1590 : r = pci_enable_device(pdev);
1591 : if (r)
1592 : DRM_WARN("pci_enable_device failed (%d)\n", r);
1593 : amdgpu_device_resume(dev, true);
1594 :
1595 : dev->switch_power_state = DRM_SWITCH_POWER_ON;
1596 : } else {
1597 : pr_info("switched off\n");
1598 : dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1599 : amdgpu_device_suspend(dev, true);
1600 : amdgpu_device_cache_pci_state(pdev);
1601 : /* Shut down the device */
1602 : pci_disable_device(pdev);
1603 : pci_set_power_state(pdev, PCI_D3cold);
1604 : dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1605 : }
1606 : }
1607 :
1608 : /**
1609 : * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1610 : *
1611 : * @pdev: pci dev pointer
1612 : *
1613 : * Callback for the switcheroo driver. Checks whether the switcheroo
1614 : * state can be changed.
1615 : * Returns true if the state can be changed, false if not.
1616 : */
1617 : static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1618 : {
1619 : struct drm_device *dev = pci_get_drvdata(pdev);
1620 :
1621 : /*
1622 : * FIXME: open_count is protected by drm_global_mutex but that would lead to
1623 : * locking inversion with the driver load path. And the access here is
1624 : * completely racy anyway. So don't bother with locking for now.
1625 : */
1626 : return atomic_read(&dev->open_count) == 0;
1627 : }
1628 :
1629 : static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1630 : .set_gpu_state = amdgpu_switcheroo_set_state,
1631 : .reprobe = NULL,
1632 : .can_switch = amdgpu_switcheroo_can_switch,
1633 : };
1634 :
1635 : /**
1636 : * amdgpu_device_ip_set_clockgating_state - set the CG state
1637 : *
1638 : * @dev: amdgpu_device pointer
1639 : * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1640 : * @state: clockgating state (gate or ungate)
1641 : *
1642 : * Sets the requested clockgating state for all instances of
1643 : * the hardware IP specified.
1644 : * Returns the error code from the last instance.
1645 : */
1646 0 : int amdgpu_device_ip_set_clockgating_state(void *dev,
1647 : enum amd_ip_block_type block_type,
1648 : enum amd_clockgating_state state)
1649 : {
1650 0 : struct amdgpu_device *adev = dev;
1651 0 : int i, r = 0;
1652 :
1653 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
1654 0 : if (!adev->ip_blocks[i].status.valid)
1655 0 : continue;
1656 0 : if (adev->ip_blocks[i].version->type != block_type)
1657 0 : continue;
1658 0 : if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1659 0 : continue;
1660 0 : r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1661 : (void *)adev, state);
1662 0 : if (r)
1663 0 : DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1664 : adev->ip_blocks[i].version->funcs->name, r);
1665 : }
1666 0 : return r;
1667 : }
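/*
 * Illustrative call of the helper above (hypothetical caller, not taken from
 * this file): gate the clocks of all GFX IP instances, using the existing
 * amd_ip_block_type and amd_clockgating_state enumerators.
 *
 *   r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                              AMD_CG_STATE_GATE);
 */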
1668 :
1669 : /**
1670 : * amdgpu_device_ip_set_powergating_state - set the PG state
1671 : *
1672 : * @dev: amdgpu_device pointer
1673 : * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1674 : * @state: powergating state (gate or ungate)
1675 : *
1676 : * Sets the requested powergating state for all instances of
1677 : * the hardware IP specified.
1678 : * Returns the error code from the last instance.
1679 : */
1680 0 : int amdgpu_device_ip_set_powergating_state(void *dev,
1681 : enum amd_ip_block_type block_type,
1682 : enum amd_powergating_state state)
1683 : {
1684 0 : struct amdgpu_device *adev = dev;
1685 0 : int i, r = 0;
1686 :
1687 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
1688 0 : if (!adev->ip_blocks[i].status.valid)
1689 0 : continue;
1690 0 : if (adev->ip_blocks[i].version->type != block_type)
1691 0 : continue;
1692 0 : if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1693 0 : continue;
1694 0 : r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1695 : (void *)adev, state);
1696 0 : if (r)
1697 0 : DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1698 : adev->ip_blocks[i].version->funcs->name, r);
1699 : }
1700 0 : return r;
1701 : }
1702 :
1703 : /**
1704 : * amdgpu_device_ip_get_clockgating_state - get the CG state
1705 : *
1706 : * @adev: amdgpu_device pointer
1707 : * @flags: clockgating feature flags
1708 : *
1709 : * Walks the list of IPs on the device and updates the clockgating
1710 : * flags for each IP.
1711 : * Updates @flags with the feature flags for each hardware IP where
1712 : * clockgating is enabled.
1713 : */
1714 0 : void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1715 : u64 *flags)
1716 : {
1717 : int i;
1718 :
1719 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
1720 0 : if (!adev->ip_blocks[i].status.valid)
1721 0 : continue;
1722 0 : if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1723 0 : adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1724 : }
1725 0 : }
1726 :
1727 : /**
1728 : * amdgpu_device_ip_wait_for_idle - wait for idle
1729 : *
1730 : * @adev: amdgpu_device pointer
1731 : * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1732 : *
1733 : * Waits for the requested hardware IP to be idle.
1734 : * Returns 0 for success or a negative error code on failure.
1735 : */
1736 0 : int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1737 : enum amd_ip_block_type block_type)
1738 : {
1739 : int i, r;
1740 :
1741 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
1742 0 : if (!adev->ip_blocks[i].status.valid)
1743 0 : continue;
1744 0 : if (adev->ip_blocks[i].version->type == block_type) {
1745 0 : r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1746 0 : if (r)
1747 : return r;
1748 : break;
1749 : }
1750 : }
1751 : return 0;
1752 :
1753 : }
1754 :
1755 : /**
1756 : * amdgpu_device_ip_is_idle - is the hardware IP idle
1757 : *
1758 : * @adev: amdgpu_device pointer
1759 : * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1760 : *
1761 : * Check if the hardware IP is idle or not.
1762 : * Returns true if the IP is idle, false if not.
1763 : */
1764 0 : bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1765 : enum amd_ip_block_type block_type)
1766 : {
1767 : int i;
1768 :
1769 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
1770 0 : if (!adev->ip_blocks[i].status.valid)
1771 0 : continue;
1772 0 : if (adev->ip_blocks[i].version->type == block_type)
1773 0 : return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1774 : }
1775 : return true;
1776 :
1777 : }
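
/*
 * Illustration only, not part of the original file: a sketch pairing the two
 * idle helpers above. A caller might check is_idle() opportunistically and
 * fall back to wait_for_idle() before touching the block; GFX is just an
 * example block type.
 */
static int example_quiesce_gfx(struct amdgpu_device *adev)
{
	if (amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX))
		return 0;

	return amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
}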
1778 :
1779 : /**
1780 : * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1781 : *
1782 : * @adev: amdgpu_device pointer
1783 : * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1784 : *
1785 : * Returns a pointer to the hardware IP block structure
1786 : * if it exists for the asic, otherwise NULL.
1787 : */
1788 : struct amdgpu_ip_block *
1789 0 : amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1790 : enum amd_ip_block_type type)
1791 : {
1792 : int i;
1793 :
1794 0 : for (i = 0; i < adev->num_ip_blocks; i++)
1795 0 : if (adev->ip_blocks[i].version->type == type)
1796 0 : return &adev->ip_blocks[i];
1797 :
1798 : return NULL;
1799 : }
1800 :
1801 : /**
1802 : * amdgpu_device_ip_block_version_cmp
1803 : *
1804 : * @adev: amdgpu_device pointer
1805 : * @type: enum amd_ip_block_type
1806 : * @major: major version
1807 : * @minor: minor version
1808 : *
1809 : * return 0 if equal or greater
1810 :  * Returns 0 if the registered IP block version is greater than or equal
1811 :  * to the requested (major, minor), 1 if it is smaller or the ip_block doesn't exist.
1812 0 : int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1813 : enum amd_ip_block_type type,
1814 : u32 major, u32 minor)
1815 : {
1816 0 : struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1817 :
1818 0 : if (ip_block && ((ip_block->version->major > major) ||
1819 0 : ((ip_block->version->major == major) &&
1820 0 : (ip_block->version->minor >= minor))))
1821 : return 0;
1822 :
1823 0 : return 1;
1824 : }
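
/*
 * Illustration only, not part of the original file: a sketch of gating a
 * feature on a minimum IP version. The GFX block and the 9.0 requirement
 * are arbitrary examples, not real driver policy.
 */
static bool example_gfx_is_at_least_9_0(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *ip;

	ip = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
	if (ip)
		dev_dbg(adev->dev, "GFX IP block v%u.%u\n",
			ip->version->major, ip->version->minor);

	/* the cmp helper returns 0 when the registered version is >= 9.0 */
	return amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
						  9, 0) == 0;
}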
1825 :
1826 : /**
1827 : * amdgpu_device_ip_block_add
1828 : *
1829 : * @adev: amdgpu_device pointer
1830 : * @ip_block_version: pointer to the IP to add
1831 : *
1832 : * Adds the IP block driver information to the collection of IPs
1833 : * on the asic.
1834 : */
1835 0 : int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1836 : const struct amdgpu_ip_block_version *ip_block_version)
1837 : {
1838 0 : if (!ip_block_version)
1839 : return -EINVAL;
1840 :
1841 0 : switch (ip_block_version->type) {
1842 : case AMD_IP_BLOCK_TYPE_VCN:
1843 0 : if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1844 : return 0;
1845 : break;
1846 : case AMD_IP_BLOCK_TYPE_JPEG:
1847 0 : if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1848 : return 0;
1849 : break;
1850 : default:
1851 : break;
1852 : }
1853 :
1854 0 : DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1855 : ip_block_version->funcs->name);
1856 :
1857 0 : adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1858 :
1859 0 : return 0;
1860 : }
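
/*
 * Illustration only, not part of the original file: the usual registration
 * pattern used by the asic-specific *_set_ip_blocks() routines.
 * "example_common_ip_block" is a hypothetical descriptor standing in for the
 * real amdgpu_ip_block_version objects exported by the per-asic files.
 */
extern const struct amdgpu_ip_block_version example_common_ip_block;

static int example_set_ip_blocks(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
	if (r)
		return r;

	/* ...followed by the GMC, IH, PSP, GFX, SDMA, display and media blocks */
	return 0;
}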
1861 :
1862 : /**
1863 : * amdgpu_device_enable_virtual_display - enable virtual display feature
1864 : *
1865 : * @adev: amdgpu_device pointer
1866 : *
1867 :  * Enables the virtual display feature if the user has enabled it via
1868 : * the module parameter virtual_display. This feature provides a virtual
1869 : * display hardware on headless boards or in virtualized environments.
1870 : * This function parses and validates the configuration string specified by
1871 :  * the user and configures the virtual display configuration (number of
1872 : * virtual connectors, crtcs, etc.) specified.
1873 : */
1874 0 : static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1875 : {
1876 0 : adev->enable_virtual_display = false;
1877 :
1878 0 : if (amdgpu_virtual_display) {
1879 0 : const char *pci_address_name = pci_name(adev->pdev);
1880 : char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1881 :
1882 0 : pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1883 0 : pciaddstr_tmp = pciaddstr;
1884 0 : while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1885 0 : pciaddname = strsep(&pciaddname_tmp, ",");
1886 0 : if (!strcmp("all", pciaddname)
1887 0 : || !strcmp(pci_address_name, pciaddname)) {
1888 : long num_crtc;
1889 0 : int res = -1;
1890 :
1891 0 : adev->enable_virtual_display = true;
1892 :
1893 0 : if (pciaddname_tmp)
1894 0 : res = kstrtol(pciaddname_tmp, 10,
1895 : &num_crtc);
1896 :
1897 0 : if (!res) {
1898 0 : if (num_crtc < 1)
1899 0 : num_crtc = 1;
1900 0 : if (num_crtc > 6)
1901 0 : num_crtc = 6;
1902 0 : adev->mode_info.num_crtc = num_crtc;
1903 : } else {
1904 0 : adev->mode_info.num_crtc = 1;
1905 : }
1906 : break;
1907 : }
1908 : }
1909 :
1910 0 : DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1911 : amdgpu_virtual_display, pci_address_name,
1912 : adev->enable_virtual_display, adev->mode_info.num_crtc);
1913 :
1914 0 : kfree(pciaddstr);
1915 : }
1916 0 : }
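
/*
 * Illustration only, not part of the original file: the configuration string
 * parsed above comes from the "virtual_display" module parameter. Assuming
 * the "<pci-bus-id>,<crtc count>;..." format handled by the parser, plausible
 * invocations are:
 *
 *   modprobe amdgpu virtual_display=0000:01:00.0,2
 *   modprobe amdgpu virtual_display=all,1
 */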
1917 :
1918 : /**
1919 : * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1920 : *
1921 : * @adev: amdgpu_device pointer
1922 : *
1923 : * Parses the asic configuration parameters specified in the gpu info
1924 :  * firmware and makes them available to the driver for use in configuring
1925 : * the asic.
1926 : * Returns 0 on success, -EINVAL on failure.
1927 : */
1928 0 : static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1929 : {
1930 : const char *chip_name;
1931 : char fw_name[40];
1932 : int err;
1933 : const struct gpu_info_firmware_header_v1_0 *hdr;
1934 :
1935 0 : adev->firmware.gpu_info_fw = NULL;
1936 :
1937 0 : if (adev->mman.discovery_bin) {
1938 : /*
1939 : * FIXME: The bounding box is still needed by Navi12, so
1940 : * temporarily read it from gpu_info firmware. Should be dropped
1941 : * when DAL no longer needs it.
1942 : */
1943 0 : if (adev->asic_type != CHIP_NAVI12)
1944 : return 0;
1945 : }
1946 :
1947 0 : switch (adev->asic_type) {
1948 : default:
1949 : return 0;
1950 : case CHIP_VEGA10:
1951 : chip_name = "vega10";
1952 : break;
1953 : case CHIP_VEGA12:
1954 0 : chip_name = "vega12";
1955 0 : break;
1956 : case CHIP_RAVEN:
1957 0 : if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1958 : chip_name = "raven2";
1959 0 : else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1960 : chip_name = "picasso";
1961 : else
1962 0 : chip_name = "raven";
1963 : break;
1964 : case CHIP_ARCTURUS:
1965 0 : chip_name = "arcturus";
1966 0 : break;
1967 : case CHIP_NAVI12:
1968 0 : chip_name = "navi12";
1969 0 : break;
1970 : }
1971 :
1972 0 : snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1973 0 : err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1974 0 : if (err) {
1975 0 : dev_err(adev->dev,
1976 : "Failed to load gpu_info firmware \"%s\"\n",
1977 : fw_name);
1978 0 : goto out;
1979 : }
1980 0 : err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1981 0 : if (err) {
1982 0 : dev_err(adev->dev,
1983 : "Failed to validate gpu_info firmware \"%s\"\n",
1984 : fw_name);
1985 0 : goto out;
1986 : }
1987 :
1988 0 : hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1989 0 : amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1990 :
1991 0 : switch (hdr->version_major) {
1992 : case 1:
1993 : {
1994 0 : const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1995 0 : (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1996 0 : le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1997 :
1998 : /*
1999 :  * Should be dropped when DAL no longer needs it.
2000 : */
2001 0 : if (adev->asic_type == CHIP_NAVI12)
2002 : goto parse_soc_bounding_box;
2003 :
2004 0 : adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2005 0 : adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2006 0 : adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2007 0 : adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2008 0 : adev->gfx.config.max_texture_channel_caches =
2009 0 : le32_to_cpu(gpu_info_fw->gc_num_tccs);
2010 0 : adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2011 0 : adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2012 0 : adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2013 0 : adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2014 0 : adev->gfx.config.double_offchip_lds_buf =
2015 0 : le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2016 0 : adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2017 0 : adev->gfx.cu_info.max_waves_per_simd =
2018 0 : le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2019 0 : adev->gfx.cu_info.max_scratch_slots_per_cu =
2020 0 : le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2021 0 : adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2022 0 : if (hdr->version_minor >= 1) {
2023 0 : const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2024 0 : (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2025 0 : le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2026 0 : adev->gfx.config.num_sc_per_sh =
2027 0 : le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2028 0 : adev->gfx.config.num_packer_per_sc =
2029 0 : le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2030 : }
2031 :
2032 : parse_soc_bounding_box:
2033 : /*
2034 :  * soc bounding box info is not integrated into the discovery table,
2035 :  * so we always need to parse it from the gpu_info firmware when needed.
2036 : */
2037 0 : if (hdr->version_minor == 2) {
2038 0 : const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2039 0 : (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2040 0 : le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2041 0 : adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2042 : }
2043 : break;
2044 : }
2045 : default:
2046 0 : dev_err(adev->dev,
2047 : "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2048 0 : err = -EINVAL;
2049 0 : goto out;
2050 : }
2051 : out:
2052 : return err;
2053 : }
2054 :
2055 : /**
2056 : * amdgpu_device_ip_early_init - run early init for hardware IPs
2057 : *
2058 : * @adev: amdgpu_device pointer
2059 : *
2060 : * Early initialization pass for hardware IPs. The hardware IPs that make
2061 :  * up each asic are discovered and each IP's early_init callback is run. This
2062 : * is the first stage in initializing the asic.
2063 : * Returns 0 on success, negative error code on failure.
2064 : */
2065 0 : static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2066 : {
2067 0 : struct drm_device *dev = adev_to_drm(adev);
2068 : struct pci_dev *parent;
2069 : int i, r;
2070 :
2071 0 : amdgpu_device_enable_virtual_display(adev);
2072 :
2073 0 : if (amdgpu_sriov_vf(adev)) {
2074 0 : r = amdgpu_virt_request_full_gpu(adev, true);
2075 0 : if (r)
2076 : return r;
2077 : }
2078 :
2079 0 : switch (adev->asic_type) {
2080 : #ifdef CONFIG_DRM_AMDGPU_SI
2081 : case CHIP_VERDE:
2082 : case CHIP_TAHITI:
2083 : case CHIP_PITCAIRN:
2084 : case CHIP_OLAND:
2085 : case CHIP_HAINAN:
2086 : adev->family = AMDGPU_FAMILY_SI;
2087 : r = si_set_ip_blocks(adev);
2088 : if (r)
2089 : return r;
2090 : break;
2091 : #endif
2092 : #ifdef CONFIG_DRM_AMDGPU_CIK
2093 : case CHIP_BONAIRE:
2094 : case CHIP_HAWAII:
2095 : case CHIP_KAVERI:
2096 : case CHIP_KABINI:
2097 : case CHIP_MULLINS:
2098 : if (adev->flags & AMD_IS_APU)
2099 : adev->family = AMDGPU_FAMILY_KV;
2100 : else
2101 : adev->family = AMDGPU_FAMILY_CI;
2102 :
2103 : r = cik_set_ip_blocks(adev);
2104 : if (r)
2105 : return r;
2106 : break;
2107 : #endif
2108 : case CHIP_TOPAZ:
2109 : case CHIP_TONGA:
2110 : case CHIP_FIJI:
2111 : case CHIP_POLARIS10:
2112 : case CHIP_POLARIS11:
2113 : case CHIP_POLARIS12:
2114 : case CHIP_VEGAM:
2115 : case CHIP_CARRIZO:
2116 : case CHIP_STONEY:
2117 0 : if (adev->flags & AMD_IS_APU)
2118 0 : adev->family = AMDGPU_FAMILY_CZ;
2119 : else
2120 0 : adev->family = AMDGPU_FAMILY_VI;
2121 :
2122 0 : r = vi_set_ip_blocks(adev);
2123 0 : if (r)
2124 : return r;
2125 : break;
2126 : default:
2127 0 : r = amdgpu_discovery_set_ip_blocks(adev);
2128 0 : if (r)
2129 : return r;
2130 : break;
2131 : }
2132 :
2133 : if (amdgpu_has_atpx() &&
2134 : (amdgpu_is_atpx_hybrid() ||
2135 : amdgpu_has_atpx_dgpu_power_cntl()) &&
2136 : ((adev->flags & AMD_IS_APU) == 0) &&
2137 : !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2138 : adev->flags |= AMD_IS_PX;
2139 :
2140 0 : if (!(adev->flags & AMD_IS_APU)) {
2141 0 : parent = pci_upstream_bridge(adev->pdev);
2142 0 : adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2143 : }
2144 :
2145 0 : amdgpu_amdkfd_device_probe(adev);
2146 :
2147 0 : adev->pm.pp_feature = amdgpu_pp_feature_mask;
2148 0 : if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2149 0 : adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2150 0 : if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2151 0 : adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2152 :
2153 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
2154 0 : if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2155 0 : DRM_ERROR("disabled ip block: %d <%s>\n",
2156 : i, adev->ip_blocks[i].version->funcs->name);
2157 0 : adev->ip_blocks[i].status.valid = false;
2158 : } else {
2159 0 : if (adev->ip_blocks[i].version->funcs->early_init) {
2160 0 : r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2161 0 : if (r == -ENOENT) {
2162 0 : adev->ip_blocks[i].status.valid = false;
2163 0 : } else if (r) {
2164 0 : DRM_ERROR("early_init of IP block <%s> failed %d\n",
2165 : adev->ip_blocks[i].version->funcs->name, r);
2166 0 : return r;
2167 : } else {
2168 0 : adev->ip_blocks[i].status.valid = true;
2169 : }
2170 : } else {
2171 0 : adev->ip_blocks[i].status.valid = true;
2172 : }
2173 : }
2174 : /* get the vbios after the asic_funcs are set up */
2175 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2176 0 : r = amdgpu_device_parse_gpu_info_fw(adev);
2177 0 : if (r)
2178 : return r;
2179 :
2180 : /* Read BIOS */
2181 0 : if (!amdgpu_get_bios(adev))
2182 : return -EINVAL;
2183 :
2184 0 : r = amdgpu_atombios_init(adev);
2185 0 : if (r) {
2186 0 : dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2187 0 : amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2188 0 : return r;
2189 : }
2190 :
2191 :                  /* get pf2vf msg info at its earliest time */
2192 0 : if (amdgpu_sriov_vf(adev))
2193 0 : amdgpu_virt_init_data_exchange(adev);
2194 :
2195 : }
2196 : }
2197 :
2198 0 : adev->cg_flags &= amdgpu_cg_mask;
2199 0 : adev->pg_flags &= amdgpu_pg_mask;
2200 :
2201 0 : return 0;
2202 : }
2203 :
2204 0 : static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2205 : {
2206 : int i, r;
2207 :
2208 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
2209 0 : if (!adev->ip_blocks[i].status.sw)
2210 0 : continue;
2211 0 : if (adev->ip_blocks[i].status.hw)
2212 0 : continue;
2213 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2214 0 : (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2215 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2216 0 : r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2217 0 : if (r) {
2218 0 : DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2219 : adev->ip_blocks[i].version->funcs->name, r);
2220 0 : return r;
2221 : }
2222 0 : adev->ip_blocks[i].status.hw = true;
2223 : }
2224 : }
2225 :
2226 : return 0;
2227 : }
2228 :
2229 0 : static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2230 : {
2231 : int i, r;
2232 :
2233 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
2234 0 : if (!adev->ip_blocks[i].status.sw)
2235 0 : continue;
2236 0 : if (adev->ip_blocks[i].status.hw)
2237 0 : continue;
2238 0 : r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2239 0 : if (r) {
2240 0 : DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2241 : adev->ip_blocks[i].version->funcs->name, r);
2242 0 : return r;
2243 : }
2244 0 : adev->ip_blocks[i].status.hw = true;
2245 : }
2246 :
2247 : return 0;
2248 : }
2249 :
2250 0 : static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2251 : {
2252 0 : int r = 0;
2253 : int i;
2254 : uint32_t smu_version;
2255 :
2256 0 : if (adev->asic_type >= CHIP_VEGA10) {
2257 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
2258 0 : if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2259 0 : continue;
2260 :
2261 0 : if (!adev->ip_blocks[i].status.sw)
2262 0 : continue;
2263 :
2264 : /* no need to do the fw loading again if already done*/
2265 0 : if (adev->ip_blocks[i].status.hw == true)
2266 : break;
2267 :
2268 0 : if (amdgpu_in_reset(adev) || adev->in_suspend) {
2269 0 : r = adev->ip_blocks[i].version->funcs->resume(adev);
2270 0 : if (r) {
2271 0 : DRM_ERROR("resume of IP block <%s> failed %d\n",
2272 : adev->ip_blocks[i].version->funcs->name, r);
2273 0 : return r;
2274 : }
2275 : } else {
2276 0 : r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2277 0 : if (r) {
2278 0 : DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2279 : adev->ip_blocks[i].version->funcs->name, r);
2280 0 : return r;
2281 : }
2282 : }
2283 :
2284 0 : adev->ip_blocks[i].status.hw = true;
2285 0 : break;
2286 : }
2287 : }
2288 :
2289 0 : if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2290 0 : r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2291 :
2292 : return r;
2293 : }
2294 :
2295 0 : static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2296 : {
2297 : long timeout;
2298 : int r, i;
2299 :
2300 0 : for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2301 0 : struct amdgpu_ring *ring = adev->rings[i];
2302 :
2303 : /* No need to setup the GPU scheduler for rings that don't need it */
2304 0 : if (!ring || ring->no_scheduler)
2305 0 : continue;
2306 :
2307 0 : switch (ring->funcs->type) {
2308 : case AMDGPU_RING_TYPE_GFX:
2309 0 : timeout = adev->gfx_timeout;
2310 0 : break;
2311 : case AMDGPU_RING_TYPE_COMPUTE:
2312 0 : timeout = adev->compute_timeout;
2313 0 : break;
2314 : case AMDGPU_RING_TYPE_SDMA:
2315 0 : timeout = adev->sdma_timeout;
2316 0 : break;
2317 : default:
2318 0 : timeout = adev->video_timeout;
2319 0 : break;
2320 : }
2321 :
2322 0 : r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2323 : ring->num_hw_submission, amdgpu_job_hang_limit,
2324 0 : timeout, adev->reset_domain->wq,
2325 0 : ring->sched_score, ring->name,
2326 : adev->dev);
2327 0 : if (r) {
2328 0 : DRM_ERROR("Failed to create scheduler on ring %s.\n",
2329 : ring->name);
2330 0 : return r;
2331 : }
2332 : }
2333 :
2334 : return 0;
2335 : }
2336 :
2337 :
2338 : /**
2339 : * amdgpu_device_ip_init - run init for hardware IPs
2340 : *
2341 : * @adev: amdgpu_device pointer
2342 : *
2343 : * Main initialization pass for hardware IPs. The list of all the hardware
2344 : * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2345 : * are run. sw_init initializes the software state associated with each IP
2346 : * and hw_init initializes the hardware associated with each IP.
2347 : * Returns 0 on success, negative error code on failure.
2348 : */
2349 0 : static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2350 : {
2351 : int i, r;
2352 :
2353 0 : r = amdgpu_ras_init(adev);
2354 0 : if (r)
2355 : return r;
2356 :
2357 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
2358 0 : if (!adev->ip_blocks[i].status.valid)
2359 0 : continue;
2360 0 : r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2361 0 : if (r) {
2362 0 : DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2363 : adev->ip_blocks[i].version->funcs->name, r);
2364 0 : goto init_failed;
2365 : }
2366 0 : adev->ip_blocks[i].status.sw = true;
2367 :
2368 : /* need to do gmc hw init early so we can allocate gpu mem */
2369 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2370 : /* Try to reserve bad pages early */
2371 0 : if (amdgpu_sriov_vf(adev))
2372 0 : amdgpu_virt_exchange_data(adev);
2373 :
2374 0 : r = amdgpu_device_vram_scratch_init(adev);
2375 0 : if (r) {
2376 0 : DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2377 0 : goto init_failed;
2378 : }
2379 0 : r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2380 0 : if (r) {
2381 0 : DRM_ERROR("hw_init %d failed %d\n", i, r);
2382 0 : goto init_failed;
2383 : }
2384 0 : r = amdgpu_device_wb_init(adev);
2385 0 : if (r) {
2386 0 : DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2387 0 : goto init_failed;
2388 : }
2389 0 : adev->ip_blocks[i].status.hw = true;
2390 :
2391 : /* right after GMC hw init, we create CSA */
2392 0 : if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2393 0 : r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2394 : AMDGPU_GEM_DOMAIN_VRAM,
2395 : AMDGPU_CSA_SIZE);
2396 0 : if (r) {
2397 0 : DRM_ERROR("allocate CSA failed %d\n", r);
2398 0 : goto init_failed;
2399 : }
2400 : }
2401 : }
2402 : }
2403 :
2404 0 : if (amdgpu_sriov_vf(adev))
2405 0 : amdgpu_virt_init_data_exchange(adev);
2406 :
2407 0 : r = amdgpu_ib_pool_init(adev);
2408 0 : if (r) {
2409 0 : dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2410 0 : amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2411 0 : goto init_failed;
2412 : }
2413 :
2414    0 :     r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2415 0 : if (r)
2416 : goto init_failed;
2417 :
2418 0 : r = amdgpu_device_ip_hw_init_phase1(adev);
2419 0 : if (r)
2420 : goto init_failed;
2421 :
2422 0 : r = amdgpu_device_fw_loading(adev);
2423 0 : if (r)
2424 : goto init_failed;
2425 :
2426 0 : r = amdgpu_device_ip_hw_init_phase2(adev);
2427 0 : if (r)
2428 : goto init_failed;
2429 :
2430 : /*
2431 :  * Retired pages will be loaded from eeprom and reserved here;
2432 :  * this must be called after amdgpu_device_ip_hw_init_phase2, since
2433 :  * for some ASICs the RAS EEPROM code relies on the SMU being fully
2434 :  * functional for I2C communication, which is only true at this point.
2435 :  *
2436 :  * amdgpu_ras_recovery_init may fail, but the caller only cares about
2437 :  * failures caused by a bad GPU state and stops the amdgpu init process
2438 :  * accordingly. For other failures it still releases all the resources
2439 :  * and prints an error message, rather than returning a negative value
2440 :  * to the upper level.
2441 :  *
2442 :  * Note: theoretically, this should be called before all vram allocations
2443 :  * to protect retired pages from being reused.
2444 : */
2445 0 : r = amdgpu_ras_recovery_init(adev);
2446 0 : if (r)
2447 : goto init_failed;
2448 :
2449 :  /*
2450 :   * In the case of XGMI, grab an extra reference on the reset domain for this device
2451 :   */
2452 0 : if (adev->gmc.xgmi.num_physical_nodes > 1) {
2453 0 : if (amdgpu_xgmi_add_device(adev) == 0) {
2454 0 : if (!amdgpu_sriov_vf(adev)) {
2455 0 : struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2456 :
2457 0 : if (!hive->reset_domain ||
2458 0 : !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2459 0 : r = -ENOENT;
2460 0 : amdgpu_put_xgmi_hive(hive);
2461 0 : goto init_failed;
2462 : }
2463 :
2464 : /* Drop the early temporary reset domain we created for device */
2465 0 : amdgpu_reset_put_reset_domain(adev->reset_domain);
2466 0 : adev->reset_domain = hive->reset_domain;
2467 0 : amdgpu_put_xgmi_hive(hive);
2468 : }
2469 : }
2470 : }
2471 :
2472 0 : r = amdgpu_device_init_schedulers(adev);
2473 0 : if (r)
2474 : goto init_failed;
2475 :
2476 :          /* Don't init kfd if the whole hive needs to be reset during init */
2477 0 : if (!adev->gmc.xgmi.pending_reset)
2478 0 : amdgpu_amdkfd_device_init(adev);
2479 :
2480 0 : amdgpu_fru_get_product_info(adev);
2481 :
2482 : init_failed:
2483 0 : if (amdgpu_sriov_vf(adev))
2484 0 : amdgpu_virt_release_full_gpu(adev, true);
2485 :
2486 : return r;
2487 : }
2488 :
2489 : /**
2490 : * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2491 : *
2492 : * @adev: amdgpu_device pointer
2493 : *
2494 : * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2495 : * this function before a GPU reset. If the value is retained after a
2496 :  * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2497 : */
2498 : static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2499 : {
2500 0 : memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2501 : }
2502 :
2503 : /**
2504 : * amdgpu_device_check_vram_lost - check if vram is valid
2505 : *
2506 : * @adev: amdgpu_device pointer
2507 : *
2508 : * Checks the reset magic value written to the gart pointer in VRAM.
2509 : * The driver calls this after a GPU reset to see if the contents of
2510 :  * VRAM are lost or not.
2511 : * returns true if vram is lost, false if not.
2512 : */
2513 0 : static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2514 : {
2515 0 : if (memcmp(adev->gart.ptr, adev->reset_magic,
2516 : AMDGPU_RESET_MAGIC_NUM))
2517 : return true;
2518 :
2519 0 : if (!amdgpu_in_reset(adev))
2520 : return false;
2521 :
2522 : /*
2523 : * For all ASICs with baco/mode1 reset, the VRAM is
2524 : * always assumed to be lost.
2525 : */
2526 0 : switch (amdgpu_asic_reset_method(adev)) {
2527 : case AMD_RESET_METHOD_BACO:
2528 : case AMD_RESET_METHOD_MODE1:
2529 : return true;
2530 : default:
2531 0 : return false;
2532 : }
2533 : }
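
/*
 * Illustration only, not part of the original file: a sketch of how the two
 * helpers above pair up around a reset. The real reset machinery elsewhere in
 * this file does the actual work; this only shows the intended ordering.
 */
static bool example_reset_and_check_vram(struct amdgpu_device *adev)
{
	/* record the magic before the reset */
	amdgpu_device_fill_reset_magic(adev);

	/* ... the ASIC reset itself would happen here ... */

	/* compare the magic afterwards to decide whether VRAM survived */
	return amdgpu_device_check_vram_lost(adev);
}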
2534 :
2535 : /**
2536 : * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2537 : *
2538 : * @adev: amdgpu_device pointer
2539 : * @state: clockgating state (gate or ungate)
2540 : *
2541 : * The list of all the hardware IPs that make up the asic is walked and the
2542 : * set_clockgating_state callbacks are run.
2543 :  * During late init this enables clockgating for the hardware IPs; during
2544 :  * fini or suspend it disables clockgating.
2545 : * Returns 0 on success, negative error code on failure.
2546 : */
2547 :
2548 0 : int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2549 : enum amd_clockgating_state state)
2550 : {
2551 : int i, j, r;
2552 :
2553 0 : if (amdgpu_emu_mode == 1)
2554 : return 0;
2555 :
2556 0 : for (j = 0; j < adev->num_ip_blocks; j++) {
2557 0 : i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2558 0 : if (!adev->ip_blocks[i].status.late_initialized)
2559 0 : continue;
2560 : /* skip CG for GFX on S0ix */
2561 0 : if (adev->in_s0ix &&
2562 0 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2563 0 : continue;
2564 : /* skip CG for VCE/UVD, it's handled specially */
2565 :                  /* skip CG for UVD/VCE/VCN/JPEG, they are handled specially */
2566 0 : adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2567 0 : adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2568 0 : adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2569 0 : adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2570 : /* enable clockgating to save power */
2571 0 : r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2572 : state);
2573 0 : if (r) {
2574 0 : DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2575 : adev->ip_blocks[i].version->funcs->name, r);
2576 0 : return r;
2577 : }
2578 : }
2579 : }
2580 :
2581 : return 0;
2582 : }
2583 :
2584 0 : int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2585 : enum amd_powergating_state state)
2586 : {
2587 : int i, j, r;
2588 :
2589 0 : if (amdgpu_emu_mode == 1)
2590 : return 0;
2591 :
2592 0 : for (j = 0; j < adev->num_ip_blocks; j++) {
2593 0 : i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2594 0 : if (!adev->ip_blocks[i].status.late_initialized)
2595 0 : continue;
2596 : /* skip PG for GFX on S0ix */
2597 0 : if (adev->in_s0ix &&
2598 0 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2599 0 : continue;
2600 :                  /* skip PG for UVD/VCE/VCN/JPEG, they are handled specially */
2601 0 : if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2602 0 : adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2603 0 : adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2604 0 : adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2605 0 : adev->ip_blocks[i].version->funcs->set_powergating_state) {
2606 : /* enable powergating to save power */
2607 0 : r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2608 : state);
2609 0 : if (r) {
2610 0 : DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2611 : adev->ip_blocks[i].version->funcs->name, r);
2612 0 : return r;
2613 : }
2614 : }
2615 : }
2616 : return 0;
2617 : }
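
/*
 * Illustration only, not part of the original file: the typical pairing of
 * the two helpers above, mirroring what late init and suspend/fini do later
 * in this file: gate to save power, ungate before touching or tearing down
 * the hardware. Return values are ignored here for brevity.
 */
static void example_toggle_gating(struct amdgpu_device *adev, bool powersave)
{
	if (powersave) {
		amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
		amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
	} else {
		amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
		amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
	}
}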
2618 :
2619 0 : static int amdgpu_device_enable_mgpu_fan_boost(void)
2620 : {
2621 : struct amdgpu_gpu_instance *gpu_ins;
2622 : struct amdgpu_device *adev;
2623 0 : int i, ret = 0;
2624 :
2625 0 : mutex_lock(&mgpu_info.mutex);
2626 :
2627 : /*
2628 : * MGPU fan boost feature should be enabled
2629 : * only when there are two or more dGPUs in
2630 : * the system
2631 : */
2632 0 : if (mgpu_info.num_dgpu < 2)
2633 : goto out;
2634 :
2635 0 : for (i = 0; i < mgpu_info.num_dgpu; i++) {
2636 0 : gpu_ins = &(mgpu_info.gpu_ins[i]);
2637 0 : adev = gpu_ins->adev;
2638 0 : if (!(adev->flags & AMD_IS_APU) &&
2639 0 : !gpu_ins->mgpu_fan_enabled) {
2640 0 : ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2641 0 : if (ret)
2642 : break;
2643 :
2644 0 : gpu_ins->mgpu_fan_enabled = 1;
2645 : }
2646 : }
2647 :
2648 : out:
2649 0 : mutex_unlock(&mgpu_info.mutex);
2650 :
2651 0 : return ret;
2652 : }
2653 :
2654 : /**
2655 : * amdgpu_device_ip_late_init - run late init for hardware IPs
2656 : *
2657 : * @adev: amdgpu_device pointer
2658 : *
2659 : * Late initialization pass for hardware IPs. The list of all the hardware
2660 : * IPs that make up the asic is walked and the late_init callbacks are run.
2661 : * late_init covers any special initialization that an IP requires
2662 :  * after all of them have been initialized or something that needs to happen
2663 : * late in the init process.
2664 : * Returns 0 on success, negative error code on failure.
2665 : */
2666 0 : static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2667 : {
2668 : struct amdgpu_gpu_instance *gpu_instance;
2669 0 : int i = 0, r;
2670 :
2671 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
2672 0 : if (!adev->ip_blocks[i].status.hw)
2673 0 : continue;
2674 0 : if (adev->ip_blocks[i].version->funcs->late_init) {
2675 0 : r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2676 0 : if (r) {
2677 0 : DRM_ERROR("late_init of IP block <%s> failed %d\n",
2678 : adev->ip_blocks[i].version->funcs->name, r);
2679 0 : return r;
2680 : }
2681 : }
2682 0 : adev->ip_blocks[i].status.late_initialized = true;
2683 : }
2684 :
2685 0 : r = amdgpu_ras_late_init(adev);
2686 0 : if (r) {
2687 0 : DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2688 0 : return r;
2689 : }
2690 :
2691 0 : amdgpu_ras_set_error_query_ready(adev, true);
2692 :
2693 0 : amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2694 0 : amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2695 :
2696 0 : amdgpu_device_fill_reset_magic(adev);
2697 :
2698 0 : r = amdgpu_device_enable_mgpu_fan_boost();
2699 0 : if (r)
2700 0 : DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2701 :
2702 :          /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
2703    0 :     if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2704 :                         adev->asic_type == CHIP_ALDEBARAN))
2705 0 : amdgpu_dpm_handle_passthrough_sbr(adev, true);
2706 :
2707 0 : if (adev->gmc.xgmi.num_physical_nodes > 1) {
2708 0 : mutex_lock(&mgpu_info.mutex);
2709 :
2710 : /*
2711 :           * Reset the device p-state to low, as it was booted with a high p-state.
2712 :           *
2713 :           * This should be performed only after all devices from the same
2714 :           * hive have been initialized.
2715 :           *
2716 :           * However, the number of devices in the hive is not known in advance;
2717 :           * they are counted one by one as the devices initialize.
2718 :           *
2719 :           * So we wait until all XGMI-interlinked devices are initialized.
2720 :           * This may add some delay, as those devices may come from
2721 :           * different hives, but that should be OK.
2722 : */
2723 0 : if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2724 0 : for (i = 0; i < mgpu_info.num_gpu; i++) {
2725 0 : gpu_instance = &(mgpu_info.gpu_ins[i]);
2726 0 : if (gpu_instance->adev->flags & AMD_IS_APU)
2727 0 : continue;
2728 :
2729 0 : r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2730 : AMDGPU_XGMI_PSTATE_MIN);
2731 0 : if (r) {
2732 0 : DRM_ERROR("pstate setting failed (%d).\n", r);
2733 0 : break;
2734 : }
2735 : }
2736 : }
2737 :
2738 0 : mutex_unlock(&mgpu_info.mutex);
2739 : }
2740 :
2741 : return 0;
2742 : }
2743 :
2744 : /**
2745 : * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2746 : *
2747 : * @adev: amdgpu_device pointer
2748 : *
2749 :  * For ASICs that need to disable the SMC first
2750 : */
2751 0 : static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2752 : {
2753 : int i, r;
2754 :
2755 0 : if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2756 : return;
2757 :
2758 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
2759 0 : if (!adev->ip_blocks[i].status.hw)
2760 0 : continue;
2761 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2762 0 : r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2763 : /* XXX handle errors */
2764 0 : if (r) {
2765 0 : DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2766 : adev->ip_blocks[i].version->funcs->name, r);
2767 : }
2768 0 : adev->ip_blocks[i].status.hw = false;
2769 0 : break;
2770 : }
2771 : }
2772 : }
2773 :
2774 0 : static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2775 : {
2776 : int i, r;
2777 :
2778 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
2779 0 : if (!adev->ip_blocks[i].version->funcs->early_fini)
2780 0 : continue;
2781 :
2782 0 : r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2783 0 : if (r) {
2784 0 : DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2785 : adev->ip_blocks[i].version->funcs->name, r);
2786 : }
2787 : }
2788 :
2789 0 : amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2790 0 : amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2791 :
2792 0 : amdgpu_amdkfd_suspend(adev, false);
2793 :
2794 :          /* Workaround for ASICs that need to disable the SMC first */
2795 0 : amdgpu_device_smu_fini_early(adev);
2796 :
2797 0 : for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2798 0 : if (!adev->ip_blocks[i].status.hw)
2799 0 : continue;
2800 :
2801 0 : r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2802 : /* XXX handle errors */
2803 0 : if (r) {
2804 0 : DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2805 : adev->ip_blocks[i].version->funcs->name, r);
2806 : }
2807 :
2808 0 : adev->ip_blocks[i].status.hw = false;
2809 : }
2810 :
2811 0 : if (amdgpu_sriov_vf(adev)) {
2812 0 : if (amdgpu_virt_release_full_gpu(adev, false))
2813 0 : DRM_ERROR("failed to release exclusive mode on fini\n");
2814 : }
2815 :
2816 0 : return 0;
2817 : }
2818 :
2819 : /**
2820 : * amdgpu_device_ip_fini - run fini for hardware IPs
2821 : *
2822 : * @adev: amdgpu_device pointer
2823 : *
2824 : * Main teardown pass for hardware IPs. The list of all the hardware
2825 : * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2826 : * are run. hw_fini tears down the hardware associated with each IP
2827 : * and sw_fini tears down any software state associated with each IP.
2828 : * Returns 0 on success, negative error code on failure.
2829 : */
2830 0 : static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2831 : {
2832 : int i, r;
2833 :
2834 0 : if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2835 0 : amdgpu_virt_release_ras_err_handler_data(adev);
2836 :
2837 0 : if (adev->gmc.xgmi.num_physical_nodes > 1)
2838 0 : amdgpu_xgmi_remove_device(adev);
2839 :
2840 0 : amdgpu_amdkfd_device_fini_sw(adev);
2841 :
2842 0 : for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2843 0 : if (!adev->ip_blocks[i].status.sw)
2844 0 : continue;
2845 :
2846 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2847 0 : amdgpu_ucode_free_bo(adev);
2848 0 : amdgpu_free_static_csa(&adev->virt.csa_obj);
2849 0 : amdgpu_device_wb_fini(adev);
2850 0 : amdgpu_device_vram_scratch_fini(adev);
2851 0 : amdgpu_ib_pool_fini(adev);
2852 : }
2853 :
2854 0 : r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2855 : /* XXX handle errors */
2856 0 : if (r) {
2857 0 : DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2858 : adev->ip_blocks[i].version->funcs->name, r);
2859 : }
2860 0 : adev->ip_blocks[i].status.sw = false;
2861 0 : adev->ip_blocks[i].status.valid = false;
2862 : }
2863 :
2864 0 : for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2865 0 : if (!adev->ip_blocks[i].status.late_initialized)
2866 0 : continue;
2867 0 : if (adev->ip_blocks[i].version->funcs->late_fini)
2868 0 : adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2869 0 : adev->ip_blocks[i].status.late_initialized = false;
2870 : }
2871 :
2872 0 : amdgpu_ras_fini(adev);
2873 :
2874 0 : return 0;
2875 : }
2876 :
2877 : /**
2878 : * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2879 : *
2880 : * @work: work_struct.
2881 : */
2882 0 : static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2883 : {
2884 0 : struct amdgpu_device *adev =
2885 0 : container_of(work, struct amdgpu_device, delayed_init_work.work);
2886 : int r;
2887 :
2888 0 : r = amdgpu_ib_ring_tests(adev);
2889 0 : if (r)
2890 0 : DRM_ERROR("ib ring test failed (%d).\n", r);
2891 0 : }
2892 :
2893 0 : static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2894 : {
2895 0 : struct amdgpu_device *adev =
2896 0 : container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2897 :
2898 0 : WARN_ON_ONCE(adev->gfx.gfx_off_state);
2899 0 : WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2900 :
2901 0 : if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2902 0 : adev->gfx.gfx_off_state = true;
2903 0 : }
2904 :
2905 : /**
2906 : * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2907 : *
2908 : * @adev: amdgpu_device pointer
2909 : *
2910 :  * First suspend pass for hardware IPs. Clockgating and powergating are
2911 :  * disabled, then the list of hardware IPs is walked and the suspend
2912 :  * callbacks are run for the display (DCE) blocks only; the remaining IPs
2913 :  * are handled in phase 2. suspend puts each IP into a state suitable for suspend.
2914 : * Returns 0 on success, negative error code on failure.
2915 : */
2916 0 : static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2917 : {
2918 : int i, r;
2919 :
2920 0 : amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2921 0 : amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2922 :
2923 0 : for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2924 0 : if (!adev->ip_blocks[i].status.valid)
2925 0 : continue;
2926 :
2927 : /* displays are handled separately */
2928 0 : if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2929 0 : continue;
2930 :
2932 0 : r = adev->ip_blocks[i].version->funcs->suspend(adev);
2933 : /* XXX handle errors */
2934 0 : if (r) {
2935 0 : DRM_ERROR("suspend of IP block <%s> failed %d\n",
2936 : adev->ip_blocks[i].version->funcs->name, r);
2937 0 : return r;
2938 : }
2939 :
2940 0 : adev->ip_blocks[i].status.hw = false;
2941 : }
2942 :
2943 : return 0;
2944 : }
2945 :
2946 : /**
2947 : * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2948 : *
2949 : * @adev: amdgpu_device pointer
2950 : *
2951 :  * Second suspend pass for hardware IPs. The list of all the hardware
2952 :  * IPs that make up the asic is walked and the suspend callbacks are run
2953 :  * for every block except the displays (handled in phase 1). suspend puts
2954 :  * the hardware and software state in each IP into a state suitable for suspend.
2955 : * Returns 0 on success, negative error code on failure.
2956 : */
2957 0 : static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2958 : {
2959 : int i, r;
2960 :
2961 0 : if (adev->in_s0ix)
2962 0 : amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2963 :
2964 0 : for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2965 0 : if (!adev->ip_blocks[i].status.valid)
2966 0 : continue;
2967 : /* displays are handled in phase1 */
2968 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2969 0 : continue;
2970 : /* PSP lost connection when err_event_athub occurs */
2971 0 : if (amdgpu_ras_intr_triggered() &&
2972 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2973 0 : adev->ip_blocks[i].status.hw = false;
2974 0 : continue;
2975 : }
2976 :
2977 :                  /* skip unnecessary suspend if we have not initialized them yet */
2978 0 : if (adev->gmc.xgmi.pending_reset &&
2979 0 : !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2980 0 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2981 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2982 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2983 0 : adev->ip_blocks[i].status.hw = false;
2984 0 : continue;
2985 : }
2986 :
2987 : /* skip suspend of gfx and psp for S0ix
2988 : * gfx is in gfxoff state, so on resume it will exit gfxoff just
2989 : * like at runtime. PSP is also part of the always on hardware
2990 : * so no need to suspend it.
2991 : */
2992 0 : if (adev->in_s0ix &&
2993 0 : (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2994 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
2995 0 : continue;
2996 :
2998 0 : r = adev->ip_blocks[i].version->funcs->suspend(adev);
2999 : /* XXX handle errors */
3000 0 : if (r) {
3001 0 : DRM_ERROR("suspend of IP block <%s> failed %d\n",
3002 : adev->ip_blocks[i].version->funcs->name, r);
3003 : }
3004 0 : adev->ip_blocks[i].status.hw = false;
3005 : /* handle putting the SMC in the appropriate state */
3006    0 :             if (!amdgpu_sriov_vf(adev)) {
3007 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3008 0 : r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3009 0 : if (r) {
3010 0 : DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3011 : adev->mp1_state, r);
3012 0 : return r;
3013 : }
3014 : }
3015 : }
3016 : }
3017 :
3018 : return 0;
3019 : }
3020 :
3021 : /**
3022 : * amdgpu_device_ip_suspend - run suspend for hardware IPs
3023 : *
3024 : * @adev: amdgpu_device pointer
3025 : *
3026 : * Main suspend function for hardware IPs. The list of all the hardware
3027 : * IPs that make up the asic is walked, clockgating is disabled and the
3028 : * suspend callbacks are run. suspend puts the hardware and software state
3029 : * in each IP into a state suitable for suspend.
3030 : * Returns 0 on success, negative error code on failure.
3031 : */
3032 0 : int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3033 : {
3034 : int r;
3035 :
3036 0 : if (amdgpu_sriov_vf(adev)) {
3037 0 : amdgpu_virt_fini_data_exchange(adev);
3038 0 : amdgpu_virt_request_full_gpu(adev, false);
3039 : }
3040 :
3041 0 : r = amdgpu_device_ip_suspend_phase1(adev);
3042 0 : if (r)
3043 : return r;
3044 0 : r = amdgpu_device_ip_suspend_phase2(adev);
3045 :
3046 0 : if (amdgpu_sriov_vf(adev))
3047 0 : amdgpu_virt_release_full_gpu(adev, false);
3048 :
3049 : return r;
3050 : }
3051 :
3052 0 : static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3053 : {
3054 : int i, r;
3055 :
3056 : static enum amd_ip_block_type ip_order[] = {
3057 : AMD_IP_BLOCK_TYPE_GMC,
3058 : AMD_IP_BLOCK_TYPE_COMMON,
3059 : AMD_IP_BLOCK_TYPE_PSP,
3060 : AMD_IP_BLOCK_TYPE_IH,
3061 : };
3062 :
3063 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
3064 : int j;
3065 : struct amdgpu_ip_block *block;
3066 :
3067 0 : block = &adev->ip_blocks[i];
3068 0 : block->status.hw = false;
3069 :
3070 0 : for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3071 :
3072 0 : if (block->version->type != ip_order[j] ||
3073 0 : !block->status.valid)
3074 0 : continue;
3075 :
3076 0 : r = block->version->funcs->hw_init(adev);
3077    0 :                     DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
3078 0 : if (r)
3079 : return r;
3080 0 : block->status.hw = true;
3081 : }
3082 : }
3083 :
3084 : return 0;
3085 : }
3086 :
3087 0 : static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3088 : {
3089 : int i, r;
3090 :
3091 : static enum amd_ip_block_type ip_order[] = {
3092 : AMD_IP_BLOCK_TYPE_SMC,
3093 : AMD_IP_BLOCK_TYPE_DCE,
3094 : AMD_IP_BLOCK_TYPE_GFX,
3095 : AMD_IP_BLOCK_TYPE_SDMA,
3096 : AMD_IP_BLOCK_TYPE_UVD,
3097 : AMD_IP_BLOCK_TYPE_VCE,
3098 : AMD_IP_BLOCK_TYPE_VCN
3099 : };
3100 :
3101 0 : for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3102 : int j;
3103 : struct amdgpu_ip_block *block;
3104 :
3105 0 : for (j = 0; j < adev->num_ip_blocks; j++) {
3106 0 : block = &adev->ip_blocks[j];
3107 :
3108 0 : if (block->version->type != ip_order[i] ||
3109 0 : !block->status.valid ||
3110 0 : block->status.hw)
3111 0 : continue;
3112 :
3113 0 : if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3114 0 : r = block->version->funcs->resume(adev);
3115 : else
3116 0 : r = block->version->funcs->hw_init(adev);
3117 :
3118    0 :                     DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
3119 0 : if (r)
3120 : return r;
3121 0 : block->status.hw = true;
3122 : }
3123 : }
3124 :
3125 : return 0;
3126 : }
3127 :
3128 : /**
3129 : * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3130 : *
3131 : * @adev: amdgpu_device pointer
3132 : *
3133 : * First resume function for hardware IPs. The list of all the hardware
3134 : * IPs that make up the asic is walked and the resume callbacks are run for
3135 : * COMMON, GMC, and IH. resume puts the hardware into a functional state
3136 : * after a suspend and updates the software state as necessary. This
3137 : * function is also used for restoring the GPU after a GPU reset.
3138 : * Returns 0 on success, negative error code on failure.
3139 : */
3140 0 : static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3141 : {
3142 : int i, r;
3143 :
3144 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
3145 0 : if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3146 0 : continue;
3147 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3148 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3149 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
3150 :
3151 0 : r = adev->ip_blocks[i].version->funcs->resume(adev);
3152 0 : if (r) {
3153 0 : DRM_ERROR("resume of IP block <%s> failed %d\n",
3154 : adev->ip_blocks[i].version->funcs->name, r);
3155 0 : return r;
3156 : }
3157 0 : adev->ip_blocks[i].status.hw = true;
3158 : }
3159 : }
3160 :
3161 : return 0;
3162 : }
3163 :
3164 : /**
3165 : * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3166 : *
3167 : * @adev: amdgpu_device pointer
3168 : *
3169 :  * Second resume function for hardware IPs. The list of all the hardware
3170 : * IPs that make up the asic is walked and the resume callbacks are run for
3171 : * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3172 : * functional state after a suspend and updates the software state as
3173 : * necessary. This function is also used for restoring the GPU after a GPU
3174 : * reset.
3175 : * Returns 0 on success, negative error code on failure.
3176 : */
3177 0 : static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3178 : {
3179 : int i, r;
3180 :
3181 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
3182 0 : if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3183 0 : continue;
3184 0 : if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3185 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3186 0 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3187 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3188 0 : continue;
3189 0 : r = adev->ip_blocks[i].version->funcs->resume(adev);
3190 0 : if (r) {
3191 0 : DRM_ERROR("resume of IP block <%s> failed %d\n",
3192 : adev->ip_blocks[i].version->funcs->name, r);
3193 0 : return r;
3194 : }
3195 0 : adev->ip_blocks[i].status.hw = true;
3196 : }
3197 :
3198 : return 0;
3199 : }
3200 :
3201 : /**
3202 : * amdgpu_device_ip_resume - run resume for hardware IPs
3203 : *
3204 : * @adev: amdgpu_device pointer
3205 : *
3206 : * Main resume function for hardware IPs. The hardware IPs
3207 : * are split into two resume functions because they are
3208 : * are also used in in recovering from a GPU reset and some additional
3209 :  * also used in recovering from a GPU reset and some additional
3210 :  * steps need to be taken between them. In this case (S3/S4) they are
3211 : * Returns 0 on success, negative error code on failure.
3212 : */
3213 0 : static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3214 : {
3215 : int r;
3216 :
3217 0 : r = amdgpu_amdkfd_resume_iommu(adev);
3218 0 : if (r)
3219 : return r;
3220 :
3221 0 : r = amdgpu_device_ip_resume_phase1(adev);
3222 0 : if (r)
3223 : return r;
3224 :
3225 0 : r = amdgpu_device_fw_loading(adev);
3226 0 : if (r)
3227 : return r;
3228 :
3229 0 : r = amdgpu_device_ip_resume_phase2(adev);
3230 :
3231 0 : return r;
3232 : }
3233 :
3234 : /**
3235 : * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3236 : *
3237 : * @adev: amdgpu_device pointer
3238 : *
3239 : * Query the VBIOS data tables to determine if the board supports SR-IOV.
3240 : */
3241 0 : static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3242 : {
3243 0 : if (amdgpu_sriov_vf(adev)) {
3244 0 : if (adev->is_atom_fw) {
3245 0 : if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3246 0 : adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3247 : } else {
3248 0 : if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3249 0 : adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3250 : }
3251 :
3252 0 : if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3253 0 : amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3254 : }
3255 0 : }
3256 :
3257 : /**
3258 : * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3259 : *
3260 : * @asic_type: AMD asic type
3261 : *
3262 :  * Check if there is DC (new modesetting infrastructure) support for an asic.
3263 : * returns true if DC has support, false if not.
3264 : */
3265 0 : bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3266 : {
3267 : switch (asic_type) {
3268 : #ifdef CONFIG_DRM_AMDGPU_SI
3269 : case CHIP_HAINAN:
3270 : #endif
3271 : case CHIP_TOPAZ:
3272 : /* chips with no display hardware */
3273 : return false;
3274 : #if defined(CONFIG_DRM_AMD_DC)
3275 : case CHIP_TAHITI:
3276 : case CHIP_PITCAIRN:
3277 : case CHIP_VERDE:
3278 : case CHIP_OLAND:
3279 : /*
3280 : * We have systems in the wild with these ASICs that require
3281 : * LVDS and VGA support which is not supported with DC.
3282 : *
3283 : * Fallback to the non-DC driver here by default so as not to
3284 : * cause regressions.
3285 : */
3286 : #if defined(CONFIG_DRM_AMD_DC_SI)
3287 : return amdgpu_dc > 0;
3288 : #else
3289 : return false;
3290 : #endif
3291 : case CHIP_BONAIRE:
3292 : case CHIP_KAVERI:
3293 : case CHIP_KABINI:
3294 : case CHIP_MULLINS:
3295 : /*
3296 : * We have systems in the wild with these ASICs that require
3297 : * VGA support which is not supported with DC.
3298 : *
3299 : * Fallback to the non-DC driver here by default so as not to
3300 : * cause regressions.
3301 : */
3302 0 : return amdgpu_dc > 0;
3303 : default:
3304 0 : return amdgpu_dc != 0;
3305 : #else
3306 : default:
3307 : if (amdgpu_dc > 0)
3308 : DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3309 : "but isn't supported by ASIC, ignoring\n");
3310 : return false;
3311 : #endif
3312 : }
3313 : }
3314 :
3315 : /**
3316 : * amdgpu_device_has_dc_support - check if dc is supported
3317 : *
3318 : * @adev: amdgpu_device pointer
3319 : *
3320 : * Returns true for supported, false for not supported
3321 : */
3322 0 : bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3323 : {
3324 0 : if (amdgpu_sriov_vf(adev) ||
3325 0 : adev->enable_virtual_display ||
3326 0 : (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3327 : return false;
3328 :
3329 0 : return amdgpu_device_asic_has_dc_support(adev->asic_type);
3330 : }
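
/*
 * Illustration only, not part of the original file: a sketch of how the DC
 * support check above is typically consumed when choosing a display path.
 * The messages are placeholders; the real decision is made when the display
 * IP blocks are registered.
 */
static void example_pick_display_path(struct amdgpu_device *adev)
{
	if (amdgpu_device_has_dc_support(adev))
		dev_info(adev->dev, "using the DC display stack\n");
	else
		dev_info(adev->dev, "using the legacy display stack\n");
}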
3331 :
3332 0 : static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3333 : {
3334 0 : struct amdgpu_device *adev =
3335 0 : container_of(__work, struct amdgpu_device, xgmi_reset_work);
3336 0 : struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3337 :
3338 : /* It's a bug to not have a hive within this function */
3339 0 : if (WARN_ON(!hive))
3340 : return;
3341 :
3342 : /*
3343 : * Use task barrier to synchronize all xgmi reset works across the
3344 : * hive. task_barrier_enter and task_barrier_exit will block
3345 : * until all the threads running the xgmi reset works reach
3346 : * those points. task_barrier_full will do both blocks.
3347 : */
3348 0 : if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3349 :
3350 0 : task_barrier_enter(&hive->tb);
3351 0 : adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3352 :
3353 0 : if (adev->asic_reset_res)
3354 : goto fail;
3355 :
3356 0 : task_barrier_exit(&hive->tb);
3357 0 : adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3358 :
3359 0 : if (adev->asic_reset_res)
3360 : goto fail;
3361 :
3362 0 : if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3363 0 : adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3364 0 : adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3365 : } else {
3366 :
3367 0 : task_barrier_full(&hive->tb);
3368 0 : adev->asic_reset_res = amdgpu_asic_reset(adev);
3369 : }
3370 :
3371 : fail:
3372 0 : if (adev->asic_reset_res)
3373 0 : DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3374 : adev->asic_reset_res, adev_to_drm(adev)->unique);
3375 0 : amdgpu_put_xgmi_hive(hive);
3376 : }
3377 :
3378 0 : static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3379 : {
3380 0 : char *input = amdgpu_lockup_timeout;
3381 0 : char *timeout_setting = NULL;
3382 0 : int index = 0;
3383 : long timeout;
3384 0 : int ret = 0;
3385 :
3386 : /*
3387 :   * By default the timeout for non-compute jobs is 10000 ms
3388 :   * and 60000 ms for compute jobs.
3389 :   * In SR-IOV or passthrough mode, the timeout for compute
3390 :   * jobs is 60000 ms by default.
3391 : */
3392 0 : adev->gfx_timeout = msecs_to_jiffies(10000);
3393 0 : adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3394 0 : if (amdgpu_sriov_vf(adev))
3395 0 : adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3396 0 : msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3397 : else
3398 0 : adev->compute_timeout = msecs_to_jiffies(60000);
3399 :
3400 0 : if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3401 0 : while ((timeout_setting = strsep(&input, ",")) &&
3402 0 : strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3403 0 : ret = kstrtol(timeout_setting, 0, &timeout);
3404 0 : if (ret)
3405 : return ret;
3406 :
3407 0 : if (timeout == 0) {
3408 0 : index++;
3409 0 : continue;
3410 0 : } else if (timeout < 0) {
3411 0 : timeout = MAX_SCHEDULE_TIMEOUT;
3412 0 : dev_warn(adev->dev, "lockup timeout disabled");
3413 0 : add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3414 : } else {
3415 0 : timeout = msecs_to_jiffies(timeout);
3416 : }
3417 :
3418 0 : switch (index++) {
3419 : case 0:
3420 0 : adev->gfx_timeout = timeout;
3421 0 : break;
3422 : case 1:
3423 0 : adev->compute_timeout = timeout;
3424 0 : break;
3425 : case 2:
3426 0 : adev->sdma_timeout = timeout;
3427 0 : break;
3428 : case 3:
3429 0 : adev->video_timeout = timeout;
3430 0 : break;
3431 : default:
3432 : break;
3433 : }
3434 : }
3435 : /*
3436 : * There is only one value specified and
3437 : * it should apply to all non-compute jobs.
3438 : */
3439 0 : if (index == 1) {
3440 0 : adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3441 0 : if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3442 0 : adev->compute_timeout = adev->gfx_timeout;
3443 : }
3444 : }
3445 :
3446 : return ret;
3447 : }
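
/*
 * Illustration only, not part of the original file: the string parsed above
 * is the "lockup_timeout" module parameter, read as up to four comma
 * separated values in the order gfx, compute, sdma, video (milliseconds;
 * 0 keeps the default, a negative value disables the timeout). Assuming that
 * format, a plausible setting is:
 *
 *   modprobe amdgpu lockup_timeout=10000,60000,10000,10000
 */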
3448 :
3449 : /**
3450 : * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3451 : *
3452 : * @adev: amdgpu_device pointer
3453 : *
3454 :  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3455 : */
3456 : static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3457 : {
3458 : struct iommu_domain *domain;
3459 :
3460 0 : domain = iommu_get_domain_for_dev(adev->dev);
3461 : if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3462 0 : adev->ram_is_direct_mapped = true;
3463 : }
3464 :
3465 : static const struct attribute *amdgpu_dev_attributes[] = {
3466 : &dev_attr_product_name.attr,
3467 : &dev_attr_product_number.attr,
3468 : &dev_attr_serial_number.attr,
3469 : &dev_attr_pcie_replay_count.attr,
3470 : NULL
3471 : };
3472 :
3473 : /**
3474 : * amdgpu_device_init - initialize the driver
3475 : *
3476 : * @adev: amdgpu_device pointer
3477 : * @flags: driver flags
3478 : *
3479 : * Initializes the driver info and hw (all asics).
3480 : * Returns 0 for success or an error on failure.
3481 : * Called at driver startup.
3482 : */
3483 0 : int amdgpu_device_init(struct amdgpu_device *adev,
3484 : uint32_t flags)
3485 : {
3486 0 : struct drm_device *ddev = adev_to_drm(adev);
3487 0 : struct pci_dev *pdev = adev->pdev;
3488 : int r, i;
3489 0 : bool px = false;
3490 : u32 max_MBps;
3491 :
3492 0 : adev->shutdown = false;
3493 0 : adev->flags = flags;
3494 :
3495 0 : if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3496 0 : adev->asic_type = amdgpu_force_asic_type;
3497 : else
3498 0 : adev->asic_type = flags & AMD_ASIC_MASK;
3499 :
3500 0 : adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3501 0 : if (amdgpu_emu_mode == 1)
3502 0 : adev->usec_timeout *= 10;
3503 0 : adev->gmc.gart_size = 512 * 1024 * 1024;
3504 0 : adev->accel_working = false;
3505 0 : adev->num_rings = 0;
3506 0 : adev->mman.buffer_funcs = NULL;
3507 0 : adev->mman.buffer_funcs_ring = NULL;
3508 0 : adev->vm_manager.vm_pte_funcs = NULL;
3509 0 : adev->vm_manager.vm_pte_num_scheds = 0;
3510 0 : adev->gmc.gmc_funcs = NULL;
3511 0 : adev->harvest_ip_mask = 0x0;
3512 0 : adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3513 0 : bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3514 :
3515 0 : adev->smc_rreg = &amdgpu_invalid_rreg;
3516 0 : adev->smc_wreg = &amdgpu_invalid_wreg;
3517 0 : adev->pcie_rreg = &amdgpu_invalid_rreg;
3518 0 : adev->pcie_wreg = &amdgpu_invalid_wreg;
3519 0 : adev->pciep_rreg = &amdgpu_invalid_rreg;
3520 0 : adev->pciep_wreg = &amdgpu_invalid_wreg;
3521 0 : adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3522 0 : adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3523 0 : adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3524 0 : adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3525 0 : adev->didt_rreg = &amdgpu_invalid_rreg;
3526 0 : adev->didt_wreg = &amdgpu_invalid_wreg;
3527 0 : adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3528 0 : adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3529 0 : adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3530 0 : adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3531 :
3532 0 : DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3533 : amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3534 : pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3535 :
3536 :	/* mutex initialization is all done here so we
3537 :	 * can recall the function without having locking issues */
3538 0 : mutex_init(&adev->firmware.mutex);
3539 0 : mutex_init(&adev->pm.mutex);
3540 0 : mutex_init(&adev->gfx.gpu_clock_mutex);
3541 0 : mutex_init(&adev->srbm_mutex);
3542 0 : mutex_init(&adev->gfx.pipe_reserve_mutex);
3543 0 : mutex_init(&adev->gfx.gfx_off_mutex);
3544 0 : mutex_init(&adev->grbm_idx_mutex);
3545 0 : mutex_init(&adev->mn_lock);
3546 0 : mutex_init(&adev->virt.vf_errors.lock);
3547 0 : hash_init(adev->mn_hash);
3548 0 : mutex_init(&adev->psp.mutex);
3549 0 : mutex_init(&adev->notifier_lock);
3550 0 : mutex_init(&adev->pm.stable_pstate_ctx_lock);
3551 0 : mutex_init(&adev->benchmark_mutex);
3552 :
3553 0 : amdgpu_device_init_apu_flags(adev);
3554 :
3555 0 : r = amdgpu_device_check_arguments(adev);
3556 0 : if (r)
3557 : return r;
3558 :
3559 0 : spin_lock_init(&adev->mmio_idx_lock);
3560 0 : spin_lock_init(&adev->smc_idx_lock);
3561 0 : spin_lock_init(&adev->pcie_idx_lock);
3562 0 : spin_lock_init(&adev->uvd_ctx_idx_lock);
3563 0 : spin_lock_init(&adev->didt_idx_lock);
3564 0 : spin_lock_init(&adev->gc_cac_idx_lock);
3565 0 : spin_lock_init(&adev->se_cac_idx_lock);
3566 0 : spin_lock_init(&adev->audio_endpt_idx_lock);
3567 0 : spin_lock_init(&adev->mm_stats.lock);
3568 :
3569 0 : INIT_LIST_HEAD(&adev->shadow_list);
3570 0 : mutex_init(&adev->shadow_list_lock);
3571 :
3572 0 : INIT_LIST_HEAD(&adev->reset_list);
3573 :
3574 0 : INIT_LIST_HEAD(&adev->ras_list);
3575 :
3576 0 : INIT_DELAYED_WORK(&adev->delayed_init_work,
3577 : amdgpu_device_delayed_init_work_handler);
3578 0 : INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3579 : amdgpu_device_delay_enable_gfx_off);
3580 :
3581 0 : INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3582 :
3583 0 : adev->gfx.gfx_off_req_count = 1;
3584 0 : adev->gfx.gfx_off_residency = 0;
3585 0 : adev->gfx.gfx_off_entrycount = 0;
3586 0 : adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3587 :
3588 0 : atomic_set(&adev->throttling_logging_enabled, 1);
3589 : /*
3590 : * If throttling continues, logging will be performed every minute
3591 : * to avoid log flooding. "-1" is subtracted since the thermal
3592 : * throttling interrupt comes every second. Thus, the total logging
3593 :	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3594 : * for throttling interrupt) = 60 seconds.
3595 : */
3596 0 : ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3597 0 : ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3598 :
3599 : /* Registers mapping */
3600 : /* TODO: block userspace mapping of io register */
3601 0 : if (adev->asic_type >= CHIP_BONAIRE) {
3602 0 : adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3603 0 : adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3604 : } else {
3605 0 : adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3606 0 : adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3607 : }
3608 :
3609 0 : for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3610 0 : atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3611 :
3612 0 : adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3613 0 : if (adev->rmmio == NULL) {
3614 : return -ENOMEM;
3615 : }
3616 0 : DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3617 0 : DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3618 :
3619 0 : amdgpu_device_get_pcie_info(adev);
3620 :
3621 0 : if (amdgpu_mcbp)
3622 0 : DRM_INFO("MCBP is enabled\n");
3623 :
3624 : /*
3625 :	 * The reset domain needs to be present early, before the XGMI hive is
3626 :	 * discovered (if any) and initialized, so the reset semaphore and
3627 :	 * in_gpu_reset flag can be used early during init and before any call to RREG32.
3628 : */
3629 0 : adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3630 0 : if (!adev->reset_domain)
3631 : return -ENOMEM;
3632 :
3633 : /* detect hw virtualization here */
3634 0 : amdgpu_detect_virtualization(adev);
3635 :
3636 0 : r = amdgpu_device_get_job_timeout_settings(adev);
3637 0 : if (r) {
3638 0 : dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3639 0 : return r;
3640 : }
3641 :
3642 : /* early init functions */
3643 0 : r = amdgpu_device_ip_early_init(adev);
3644 0 : if (r)
3645 : return r;
3646 :
3647 : /* Enable TMZ based on IP_VERSION */
3648 0 : amdgpu_gmc_tmz_set(adev);
3649 :
3650 0 : amdgpu_gmc_noretry_set(adev);
3651 :	/* Need to get xgmi info early to decide the reset behavior */
3652 0 : if (adev->gmc.xgmi.supported) {
3653 0 : r = adev->gfxhub.funcs->get_xgmi_info(adev);
3654 0 : if (r)
3655 : return r;
3656 : }
3657 :
3658 : /* enable PCIE atomic ops */
3659 0 : if (amdgpu_sriov_vf(adev))
3660 0 : adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3661 0 : adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3662 : (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3663 : else
3664 0 : adev->have_atomics_support =
3665 0 : !pci_enable_atomic_ops_to_root(adev->pdev,
3666 : PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3667 : PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3668 0 : if (!adev->have_atomics_support)
3669 0 :		dev_info(adev->dev, "PCIE atomic ops are not supported\n");
3670 :
3671 :	/* doorbell bar mapping and doorbell index init */
3672 0 : amdgpu_device_doorbell_init(adev);
3673 :
3674 0 : if (amdgpu_emu_mode == 1) {
3675 : /* post the asic on emulation mode */
3676 0 : emu_soc_asic_init(adev);
3677 0 : goto fence_driver_init;
3678 : }
3679 :
3680 0 : amdgpu_reset_init(adev);
3681 :
3682 :	/* detect if we are running with an SR-IOV vBIOS */
3683 0 : amdgpu_device_detect_sriov_bios(adev);
3684 :
3685 : /* check if we need to reset the asic
3686 : * E.g., driver was not cleanly unloaded previously, etc.
3687 : */
3688 0 : if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3689 0 : if (adev->gmc.xgmi.num_physical_nodes) {
3690 0 : dev_info(adev->dev, "Pending hive reset.\n");
3691 0 : adev->gmc.xgmi.pending_reset = true;
3692 :			/* Only need to init the necessary blocks for SMU to handle the reset */
3693 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
3694 0 : if (!adev->ip_blocks[i].status.valid)
3695 0 : continue;
3696 0 : if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3697 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3698 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3699 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3700 0 : DRM_DEBUG("IP %s disabled for hw_init.\n",
3701 : adev->ip_blocks[i].version->funcs->name);
3702 0 : adev->ip_blocks[i].status.hw = true;
3703 : }
3704 : }
3705 : } else {
3706 0 : r = amdgpu_asic_reset(adev);
3707 0 : if (r) {
3708 0 : dev_err(adev->dev, "asic reset on init failed\n");
3709 0 : goto failed;
3710 : }
3711 : }
3712 : }
3713 :
3714 0 : pci_enable_pcie_error_reporting(adev->pdev);
3715 :
3716 : /* Post card if necessary */
3717 0 : if (amdgpu_device_need_post(adev)) {
3718 0 : if (!adev->bios) {
3719 0 : dev_err(adev->dev, "no vBIOS found\n");
3720 0 : r = -EINVAL;
3721 0 : goto failed;
3722 : }
3723 0 : DRM_INFO("GPU posting now...\n");
3724 0 : r = amdgpu_device_asic_init(adev);
3725 0 : if (r) {
3726 0 : dev_err(adev->dev, "gpu post error!\n");
3727 0 : goto failed;
3728 : }
3729 : }
3730 :
3731 0 : if (adev->is_atom_fw) {
3732 : /* Initialize clocks */
3733 0 : r = amdgpu_atomfirmware_get_clock_info(adev);
3734 0 : if (r) {
3735 0 : dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3736 0 : amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3737 0 : goto failed;
3738 : }
3739 : } else {
3740 : /* Initialize clocks */
3741 0 : r = amdgpu_atombios_get_clock_info(adev);
3742 0 : if (r) {
3743 0 : dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3744 0 : amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3745 0 : goto failed;
3746 : }
3747 : /* init i2c buses */
3748 0 : if (!amdgpu_device_has_dc_support(adev))
3749 0 : amdgpu_atombios_i2c_init(adev);
3750 : }
3751 :
3752 : fence_driver_init:
3753 : /* Fence driver */
3754 0 : r = amdgpu_fence_driver_sw_init(adev);
3755 0 : if (r) {
3756 0 : dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3757 0 : amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3758 0 : goto failed;
3759 : }
3760 :
3761 : /* init the mode config */
3762 0 : drm_mode_config_init(adev_to_drm(adev));
3763 :
3764 0 : r = amdgpu_device_ip_init(adev);
3765 0 : if (r) {
3766 : /* failed in exclusive mode due to timeout */
3767 0 : if (amdgpu_sriov_vf(adev) &&
3768 0 : !amdgpu_sriov_runtime(adev) &&
3769 0 : amdgpu_virt_mmio_blocked(adev) &&
3770 0 : !amdgpu_virt_wait_reset(adev)) {
3771 0 : dev_err(adev->dev, "VF exclusive mode timeout\n");
3772 : /* Don't send request since VF is inactive. */
3773 0 : adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3774 0 : adev->virt.ops = NULL;
3775 0 : r = -EAGAIN;
3776 0 : goto release_ras_con;
3777 : }
3778 0 : dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3779 0 : amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3780 0 : goto release_ras_con;
3781 : }
3782 :
3783 0 : amdgpu_fence_driver_hw_init(adev);
3784 :
3785 0 : dev_info(adev->dev,
3786 : "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3787 : adev->gfx.config.max_shader_engines,
3788 : adev->gfx.config.max_sh_per_se,
3789 : adev->gfx.config.max_cu_per_sh,
3790 : adev->gfx.cu_info.number);
3791 :
3792 0 : adev->accel_working = true;
3793 :
3794 0 : amdgpu_vm_check_compute_bug(adev);
3795 :
3796 : /* Initialize the buffer migration limit. */
3797 0 : if (amdgpu_moverate >= 0)
3798 0 : max_MBps = amdgpu_moverate;
3799 : else
3800 : max_MBps = 8; /* Allow 8 MB/s. */
3801 : /* Get a log2 for easy divisions. */
3802 0 : adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
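	/*
	 * Worked example (illustrative): with the default max_MBps of 8,
	 * log2_max_MBps = ilog2(8) = 3, so the buffer migration budget can
	 * later be computed with shifts instead of divisions.
	 */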
3803 :
3804 0 : r = amdgpu_pm_sysfs_init(adev);
3805 0 : if (r) {
3806 0 : adev->pm_sysfs_en = false;
3807 0 : DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3808 : } else
3809 0 : adev->pm_sysfs_en = true;
3810 :
3811 0 : r = amdgpu_ucode_sysfs_init(adev);
3812 0 : if (r) {
3813 0 : adev->ucode_sysfs_en = false;
3814 0 : DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3815 : } else
3816 0 : adev->ucode_sysfs_en = true;
3817 :
3818 0 : r = amdgpu_psp_sysfs_init(adev);
3819 0 : if (r) {
3820 0 : adev->psp_sysfs_en = false;
3821 0 : if (!amdgpu_sriov_vf(adev))
3822 0 : DRM_ERROR("Creating psp sysfs failed\n");
3823 : } else
3824 0 : adev->psp_sysfs_en = true;
3825 :
3826 : /*
3827 : * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3828 :	 * Otherwise the mgpu fan boost feature will be skipped because the
3829 :	 * gpu instance count would be too low.
3830 : */
3831 0 : amdgpu_register_gpu_instance(adev);
3832 :
3833 :	/* enable clockgating, etc., after ib tests, since some blocks require
3834 : * explicit gating rather than handling it automatically.
3835 : */
3836 0 : if (!adev->gmc.xgmi.pending_reset) {
3837 0 : r = amdgpu_device_ip_late_init(adev);
3838 0 : if (r) {
3839 0 : dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3840 0 : amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3841 0 : goto release_ras_con;
3842 : }
3843 : /* must succeed. */
3844 0 : amdgpu_ras_resume(adev);
3845 0 : queue_delayed_work(system_wq, &adev->delayed_init_work,
3846 : msecs_to_jiffies(AMDGPU_RESUME_MS));
3847 : }
3848 :
3849 0 : if (amdgpu_sriov_vf(adev))
3850 0 : flush_delayed_work(&adev->delayed_init_work);
3851 :
3852 0 : r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3853 0 : if (r)
3854 0 : dev_err(adev->dev, "Could not create amdgpu device attr\n");
3855 :
3856 : if (IS_ENABLED(CONFIG_PERF_EVENTS))
3857 : r = amdgpu_pmu_init(adev);
3858 0 : if (r)
3859 0 : dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3860 :
3861 :	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3862 0 : if (amdgpu_device_cache_pci_state(adev->pdev))
3863 0 : pci_restore_state(pdev);
3864 :
3865 :	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
3866 : /* this will fail for cards that aren't VGA class devices, just
3867 : * ignore it */
3868 0 : if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3869 0 : vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3870 :
3871 0 : if (amdgpu_device_supports_px(ddev)) {
3872 : px = true;
3873 : vga_switcheroo_register_client(adev->pdev,
3874 : &amdgpu_switcheroo_ops, px);
3875 : vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3876 : }
3877 :
3878 0 : if (adev->gmc.xgmi.pending_reset)
3879 0 : queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3880 : msecs_to_jiffies(AMDGPU_RESUME_MS));
3881 :
3882 0 : amdgpu_device_check_iommu_direct_map(adev);
3883 :
3884 0 : return 0;
3885 :
3886 : release_ras_con:
3887 0 : amdgpu_release_ras_context(adev);
3888 :
3889 : failed:
3890 0 : amdgpu_vf_error_trans_all(adev);
3891 :
3892 0 : return r;
3893 : }
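
/*
 * Illustrative lifetime sketch (an assumption about callers outside this
 * file): a probe path calls amdgpu_device_init(adev, flags) once per device,
 * and teardown is expected to happen in two stages, amdgpu_device_fini_hw()
 * followed by amdgpu_device_fini_sw(), both defined below.
 */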
3894 :
3895 0 : static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3896 : {
3897 :
3898 : /* Clear all CPU mappings pointing to this device */
3899 0 : unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3900 :
3901 : /* Unmap all mapped bars - Doorbell, registers and VRAM */
3902 0 : amdgpu_device_doorbell_fini(adev);
3903 :
3904 0 : iounmap(adev->rmmio);
3905 0 : adev->rmmio = NULL;
3906 0 : if (adev->mman.aper_base_kaddr)
3907 0 : iounmap(adev->mman.aper_base_kaddr);
3908 0 : adev->mman.aper_base_kaddr = NULL;
3909 :
3910 : /* Memory manager related */
3911 : if (!adev->gmc.xgmi.connected_to_cpu) {
3912 : arch_phys_wc_del(adev->gmc.vram_mtrr);
3913 : arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3914 : }
3915 0 : }
3916 :
3917 : /**
3918 : * amdgpu_device_fini_hw - tear down the driver
3919 : *
3920 : * @adev: amdgpu_device pointer
3921 : *
3922 : * Tear down the driver info (all asics).
3923 : * Called at driver shutdown.
3924 : */
3925 0 : void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3926 : {
3927 0 : dev_info(adev->dev, "amdgpu: finishing device.\n");
3928 0 : flush_delayed_work(&adev->delayed_init_work);
3929 0 : adev->shutdown = true;
3930 :
3931 :	/* make sure IB tests have finished before entering exclusive mode
3932 :	 * to avoid preemption on the IB test
3933 :	 */
3934 0 : if (amdgpu_sriov_vf(adev)) {
3935 0 : amdgpu_virt_request_full_gpu(adev, false);
3936 0 : amdgpu_virt_fini_data_exchange(adev);
3937 : }
3938 :
3939 : /* disable all interrupts */
3940 0 : amdgpu_irq_disable_all(adev);
3941 0 : if (adev->mode_info.mode_config_initialized){
3942 0 : if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3943 0 : drm_helper_force_disable_all(adev_to_drm(adev));
3944 : else
3945 0 : drm_atomic_helper_shutdown(adev_to_drm(adev));
3946 : }
3947 0 : amdgpu_fence_driver_hw_fini(adev);
3948 :
3949 0 : if (adev->mman.initialized) {
3950 0 : flush_delayed_work(&adev->mman.bdev.wq);
3951 0 : ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3952 : }
3953 :
3954 0 : if (adev->pm_sysfs_en)
3955 0 : amdgpu_pm_sysfs_fini(adev);
3956 0 : if (adev->ucode_sysfs_en)
3957 0 : amdgpu_ucode_sysfs_fini(adev);
3958 0 : if (adev->psp_sysfs_en)
3959 0 : amdgpu_psp_sysfs_fini(adev);
3960 0 : sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3961 :
3962 :	/* RAS features must be disabled before hw fini */
3963 0 : amdgpu_ras_pre_fini(adev);
3964 :
3965 0 : amdgpu_device_ip_fini_early(adev);
3966 :
3967 0 : amdgpu_irq_fini_hw(adev);
3968 :
3969 0 : if (adev->mman.initialized)
3970 0 : ttm_device_clear_dma_mappings(&adev->mman.bdev);
3971 :
3972 0 : amdgpu_gart_dummy_page_fini(adev);
3973 :
3974 0 : amdgpu_device_unmap_mmio(adev);
3975 :
3976 0 : }
3977 :
3978 0 : void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3979 : {
3980 : int idx;
3981 :
3982 0 : amdgpu_fence_driver_sw_fini(adev);
3983 0 : amdgpu_device_ip_fini(adev);
3984 0 : release_firmware(adev->firmware.gpu_info_fw);
3985 0 : adev->firmware.gpu_info_fw = NULL;
3986 0 : adev->accel_working = false;
3987 :
3988 0 : amdgpu_reset_fini(adev);
3989 :
3990 : /* free i2c buses */
3991 0 : if (!amdgpu_device_has_dc_support(adev))
3992 0 : amdgpu_i2c_fini(adev);
3993 :
3994 0 : if (amdgpu_emu_mode != 1)
3995 0 : amdgpu_atombios_fini(adev);
3996 :
3997 0 : kfree(adev->bios);
3998 0 : adev->bios = NULL;
3999 0 : if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4000 : vga_switcheroo_unregister_client(adev->pdev);
4001 : vga_switcheroo_fini_domain_pm_ops(adev->dev);
4002 : }
4003 0 : if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4004 0 : vga_client_unregister(adev->pdev);
4005 :
4006 0 : if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4007 :
4008 0 : iounmap(adev->rmmio);
4009 0 : adev->rmmio = NULL;
4010 0 : amdgpu_device_doorbell_fini(adev);
4011 0 : drm_dev_exit(idx);
4012 : }
4013 :
4014 : if (IS_ENABLED(CONFIG_PERF_EVENTS))
4015 : amdgpu_pmu_fini(adev);
4016 0 : if (adev->mman.discovery_bin)
4017 0 : amdgpu_discovery_fini(adev);
4018 :
4019 0 : amdgpu_reset_put_reset_domain(adev->reset_domain);
4020 0 : adev->reset_domain = NULL;
4021 :
4022 0 : kfree(adev->pci_state);
4023 :
4024 0 : }
4025 :
4026 : /**
4027 : * amdgpu_device_evict_resources - evict device resources
4028 : * @adev: amdgpu device object
4029 : *
4030 : * Evicts all TTM device resources (VRAM BOs, GART table) from the LRU list
4031 : * of the VRAM memory type. Mainly used for evicting device resources
4032 : * at suspend time.
4033 : *
4034 : */
4035 0 : static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
4036 : {
4037 : /* No need to evict vram on APUs for suspend to ram or s2idle */
4038 0 : if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4039 : return;
4040 :
4041 0 : if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
4042 0 : DRM_WARN("evicting device resources failed\n");
4043 :
4044 : }
4045 :
4046 : /*
4047 : * Suspend & resume.
4048 : */
4049 : /**
4050 : * amdgpu_device_suspend - initiate device suspend
4051 : *
4052 : * @dev: drm dev pointer
4053 : * @fbcon: notify the fbdev of suspend
4054 : *
4055 : * Puts the hw in the suspend state (all asics).
4056 : * Returns 0 for success or an error on failure.
4057 : * Called at driver suspend.
4058 : */
4059 0 : int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4060 : {
4061 0 : struct amdgpu_device *adev = drm_to_adev(dev);
4062 :
4063 0 : if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4064 : return 0;
4065 :
4066 0 : adev->in_suspend = true;
4067 :
4068 0 : if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4069 : DRM_WARN("smart shift update failed\n");
4070 :
4071 0 : drm_kms_helper_poll_disable(dev);
4072 :
4073 : if (fbcon)
4074 : drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4075 :
4076 0 : cancel_delayed_work_sync(&adev->delayed_init_work);
4077 :
4078 0 : amdgpu_ras_suspend(adev);
4079 :
4080 0 : amdgpu_device_ip_suspend_phase1(adev);
4081 :
4082 0 : if (!adev->in_s0ix)
4083 0 : amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4084 :
4085 0 : amdgpu_device_evict_resources(adev);
4086 :
4087 0 : amdgpu_fence_driver_hw_fini(adev);
4088 :
4089 0 : amdgpu_device_ip_suspend_phase2(adev);
4090 :
4091 0 : return 0;
4092 : }
4093 :
4094 : /**
4095 : * amdgpu_device_resume - initiate device resume
4096 : *
4097 : * @dev: drm dev pointer
4098 : * @fbcon: notify the fbdev of resume
4099 : *
4100 : * Bring the hw back to operating state (all asics).
4101 : * Returns 0 for success or an error on failure.
4102 : * Called at driver resume.
4103 : */
4104 0 : int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4105 : {
4106 0 : struct amdgpu_device *adev = drm_to_adev(dev);
4107 0 : int r = 0;
4108 :
4109 0 : if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4110 : return 0;
4111 :
4112 0 : if (adev->in_s0ix)
4113 0 : amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4114 :
4115 : /* post card */
4116 0 : if (amdgpu_device_need_post(adev)) {
4117 0 : r = amdgpu_device_asic_init(adev);
4118 0 : if (r)
4119 0 : dev_err(adev->dev, "amdgpu asic init failed\n");
4120 : }
4121 :
4122 0 : r = amdgpu_device_ip_resume(adev);
4123 0 : if (r) {
4124 0 : dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4125 0 : return r;
4126 : }
4127 0 : amdgpu_fence_driver_hw_init(adev);
4128 :
4129 0 : r = amdgpu_device_ip_late_init(adev);
4130 0 : if (r)
4131 : return r;
4132 :
4133 0 : queue_delayed_work(system_wq, &adev->delayed_init_work,
4134 : msecs_to_jiffies(AMDGPU_RESUME_MS));
4135 :
4136 0 : if (!adev->in_s0ix) {
4137 0 : r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4138 0 : if (r)
4139 : return r;
4140 : }
4141 :
4142 : /* Make sure IB tests flushed */
4143 0 : flush_delayed_work(&adev->delayed_init_work);
4144 :
4145 : if (fbcon)
4146 : drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4147 :
4148 0 : drm_kms_helper_poll_enable(dev);
4149 :
4150 0 : amdgpu_ras_resume(adev);
4151 :
4152 : /*
4153 : * Most of the connector probing functions try to acquire runtime pm
4154 : * refs to ensure that the GPU is powered on when connector polling is
4155 : * performed. Since we're calling this from a runtime PM callback,
4156 : * trying to acquire rpm refs will cause us to deadlock.
4157 : *
4158 : * Since we're guaranteed to be holding the rpm lock, it's safe to
4159 : * temporarily disable the rpm helpers so this doesn't deadlock us.
4160 : */
4161 : #ifdef CONFIG_PM
4162 0 : dev->dev->power.disable_depth++;
4163 : #endif
4164 0 : if (!amdgpu_device_has_dc_support(adev))
4165 0 : drm_helper_hpd_irq_event(dev);
4166 : else
4167 0 : drm_kms_helper_hotplug_event(dev);
4168 : #ifdef CONFIG_PM
4169 0 : dev->dev->power.disable_depth--;
4170 : #endif
4171 0 : adev->in_suspend = false;
4172 :
4173 0 : if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4174 : DRM_WARN("smart shift update failed\n");
4175 :
4176 0 : return 0;
4177 : }
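
/*
 * Illustrative usage sketch (an assumption; the actual dev_pm_ops callbacks
 * live elsewhere in the driver): a system sleep handler would typically do
 *
 *	struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *	return amdgpu_device_suspend(drm_dev, true);
 *
 * and the matching resume handler would call
 *
 *	return amdgpu_device_resume(drm_dev, true);
 *
 * with fbcon = true so the fbdev helper is notified in both directions.
 */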
4178 :
4179 : /**
4180 : * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4181 : *
4182 : * @adev: amdgpu_device pointer
4183 : *
4184 : * The list of all the hardware IPs that make up the asic is walked and
4185 : * the check_soft_reset callbacks are run. check_soft_reset determines
4186 : * if the asic is still hung or not.
4187 : * Returns true if any of the IPs are still in a hung state, false if not.
4188 : */
4189 0 : static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4190 : {
4191 : int i;
4192 0 : bool asic_hang = false;
4193 :
4194 0 : if (amdgpu_sriov_vf(adev))
4195 : return true;
4196 :
4197 0 : if (amdgpu_asic_need_full_reset(adev))
4198 : return true;
4199 :
4200 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
4201 0 : if (!adev->ip_blocks[i].status.valid)
4202 0 : continue;
4203 0 : if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4204 0 : adev->ip_blocks[i].status.hang =
4205 0 : adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4206 0 : if (adev->ip_blocks[i].status.hang) {
4207 0 : dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4208 0 : asic_hang = true;
4209 : }
4210 : }
4211 : return asic_hang;
4212 : }
4213 :
4214 : /**
4215 : * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4216 : *
4217 : * @adev: amdgpu_device pointer
4218 : *
4219 : * The list of all the hardware IPs that make up the asic is walked and the
4220 : * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4221 : * handles any IP specific hardware or software state changes that are
4222 : * necessary for a soft reset to succeed.
4223 : * Returns 0 on success, negative error code on failure.
4224 : */
4225 0 : static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4226 : {
4227 0 : int i, r = 0;
4228 :
4229 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
4230 0 : if (!adev->ip_blocks[i].status.valid)
4231 0 : continue;
4232 0 : if (adev->ip_blocks[i].status.hang &&
4233 0 : adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4234 0 : r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4235 0 : if (r)
4236 : return r;
4237 : }
4238 : }
4239 :
4240 : return 0;
4241 : }
4242 :
4243 : /**
4244 : * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4245 : *
4246 : * @adev: amdgpu_device pointer
4247 : *
4248 : * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4249 : * reset is necessary to recover.
4250 : * Returns true if a full asic reset is required, false if not.
4251 : */
4252 0 : static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4253 : {
4254 : int i;
4255 :
4256 0 : if (amdgpu_asic_need_full_reset(adev))
4257 : return true;
4258 :
4259 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
4260 0 : if (!adev->ip_blocks[i].status.valid)
4261 0 : continue;
4262 0 : if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4263 0 : (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4264 0 : (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4265 0 : (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4266 : adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4267 0 : if (adev->ip_blocks[i].status.hang) {
4268 0 :				dev_info(adev->dev, "Some blocks need full reset!\n");
4269 0 : return true;
4270 : }
4271 : }
4272 : }
4273 : return false;
4274 : }
4275 :
4276 : /**
4277 : * amdgpu_device_ip_soft_reset - do a soft reset
4278 : *
4279 : * @adev: amdgpu_device pointer
4280 : *
4281 : * The list of all the hardware IPs that make up the asic is walked and the
4282 : * soft_reset callbacks are run if the block is hung. soft_reset handles any
4283 : * IP specific hardware or software state changes that are necessary to soft
4284 : * reset the IP.
4285 : * Returns 0 on success, negative error code on failure.
4286 : */
4287 0 : static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4288 : {
4289 0 : int i, r = 0;
4290 :
4291 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
4292 0 : if (!adev->ip_blocks[i].status.valid)
4293 0 : continue;
4294 0 : if (adev->ip_blocks[i].status.hang &&
4295 0 : adev->ip_blocks[i].version->funcs->soft_reset) {
4296 0 : r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4297 0 : if (r)
4298 : return r;
4299 : }
4300 : }
4301 :
4302 : return 0;
4303 : }
4304 :
4305 : /**
4306 : * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4307 : *
4308 : * @adev: amdgpu_device pointer
4309 : *
4310 : * The list of all the hardware IPs that make up the asic is walked and the
4311 : * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4312 : * handles any IP specific hardware or software state changes that are
4313 : * necessary after the IP has been soft reset.
4314 : * Returns 0 on success, negative error code on failure.
4315 : */
4316 0 : static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4317 : {
4318 0 : int i, r = 0;
4319 :
4320 0 : for (i = 0; i < adev->num_ip_blocks; i++) {
4321 0 : if (!adev->ip_blocks[i].status.valid)
4322 0 : continue;
4323 0 : if (adev->ip_blocks[i].status.hang &&
4324 0 : adev->ip_blocks[i].version->funcs->post_soft_reset)
4325 0 : r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4326 0 : if (r)
4327 : return r;
4328 : }
4329 :
4330 : return 0;
4331 : }
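
/*
 * Taken together, the helpers above form the soft-reset path used by
 * amdgpu_device_pre_asic_reset() below: if amdgpu_device_ip_check_soft_reset()
 * reports hung blocks and amdgpu_device_ip_need_full_reset() says a full reset
 * is not required, the sequence is pre_soft_reset -> soft_reset ->
 * post_soft_reset, falling back to a full ASIC reset when any step fails or a
 * block is still hung afterwards.
 */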
4332 :
4333 : /**
4334 : * amdgpu_device_recover_vram - Recover some VRAM contents
4335 : *
4336 : * @adev: amdgpu_device pointer
4337 : *
4338 : * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4339 : * restore things like GPUVM page tables after a GPU reset where
4340 : * the contents of VRAM might be lost.
4341 : *
4342 : * Returns:
4343 : * 0 on success, negative error code on failure.
4344 : */
4345 0 : static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4346 : {
4347 0 : struct dma_fence *fence = NULL, *next = NULL;
4348 : struct amdgpu_bo *shadow;
4349 : struct amdgpu_bo_vm *vmbo;
4350 0 : long r = 1, tmo;
4351 :
4352 0 : if (amdgpu_sriov_runtime(adev))
4353 : tmo = msecs_to_jiffies(8000);
4354 : else
4355 0 : tmo = msecs_to_jiffies(100);
4356 :
4357 0 : dev_info(adev->dev, "recover vram bo from shadow start\n");
4358 0 : mutex_lock(&adev->shadow_list_lock);
4359 0 : list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4360 0 : shadow = &vmbo->bo;
4361 : /* No need to recover an evicted BO */
4362 0 : if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4363 0 : shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4364 0 : shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4365 0 : continue;
4366 :
4367 0 : r = amdgpu_bo_restore_shadow(shadow, &next);
4368 0 : if (r)
4369 : break;
4370 :
4371 0 : if (fence) {
4372 0 : tmo = dma_fence_wait_timeout(fence, false, tmo);
4373 0 : dma_fence_put(fence);
4374 0 : fence = next;
4375 0 : if (tmo == 0) {
4376 : r = -ETIMEDOUT;
4377 : break;
4378 0 : } else if (tmo < 0) {
4379 : r = tmo;
4380 : break;
4381 : }
4382 : } else {
4383 0 : fence = next;
4384 : }
4385 : }
4386 0 : mutex_unlock(&adev->shadow_list_lock);
4387 :
4388 0 : if (fence)
4389 0 : tmo = dma_fence_wait_timeout(fence, false, tmo);
4390 0 : dma_fence_put(fence);
4391 :
4392 0 : if (r < 0 || tmo <= 0) {
4393 0 : dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4394 0 : return -EIO;
4395 : }
4396 :
4397 0 : dev_info(adev->dev, "recover vram bo from shadow done\n");
4398 0 : return 0;
4399 : }
4400 :
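/*
 * Usage note (summary of this file): amdgpu_device_recover_vram() is invoked
 * from amdgpu_device_reset_sriov() and amdgpu_do_asic_reset() below once the
 * ASIC is back up, so GPUVM page tables shadowed in GTT can be restored after
 * a reset that may have clobbered VRAM.
 */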
4401 :
4402 : /**
4403 : * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4404 : *
4405 : * @adev: amdgpu_device pointer
4406 : * @from_hypervisor: request from hypervisor
4407 : *
4408 : * Do a VF FLR and reinitialize the ASIC.
4409 : * Returns 0 on success, otherwise an error.
4410 : */
4411 0 : static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4412 : bool from_hypervisor)
4413 : {
4414 : int r;
4415 0 : struct amdgpu_hive_info *hive = NULL;
4416 0 : int retry_limit = 0;
4417 :
4418 : retry:
4419 0 : amdgpu_amdkfd_pre_reset(adev);
4420 :
4421 0 : if (from_hypervisor)
4422 0 : r = amdgpu_virt_request_full_gpu(adev, true);
4423 : else
4424 0 : r = amdgpu_virt_reset_gpu(adev);
4425 0 : if (r)
4426 : return r;
4427 :
4428 : /* Resume IP prior to SMC */
4429 0 : r = amdgpu_device_ip_reinit_early_sriov(adev);
4430 0 : if (r)
4431 : goto error;
4432 :
4433 0 : amdgpu_virt_init_data_exchange(adev);
4434 :
4435 0 : r = amdgpu_device_fw_loading(adev);
4436 0 : if (r)
4437 : return r;
4438 :
4439 : /* now we are okay to resume SMC/CP/SDMA */
4440 0 : r = amdgpu_device_ip_reinit_late_sriov(adev);
4441 0 : if (r)
4442 : goto error;
4443 :
4444 0 : hive = amdgpu_get_xgmi_hive(adev);
4445 : /* Update PSP FW topology after reset */
4446 0 : if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4447 0 : r = amdgpu_xgmi_update_topology(hive, adev);
4448 :
4449 0 : if (hive)
4450 0 : amdgpu_put_xgmi_hive(hive);
4451 :
4452 0 : if (!r) {
4453 0 : amdgpu_irq_gpu_reset_resume_helper(adev);
4454 0 : r = amdgpu_ib_ring_tests(adev);
4455 :
4456 0 : amdgpu_amdkfd_post_reset(adev);
4457 : }
4458 :
4459 : error:
4460 0 : if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4461 0 : amdgpu_inc_vram_lost(adev);
4462 0 : r = amdgpu_device_recover_vram(adev);
4463 : }
4464 0 : amdgpu_virt_release_full_gpu(adev, true);
4465 :
4466 0 : if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4467 0 : if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4468 0 : retry_limit++;
4469 0 : goto retry;
4470 : } else
4471 0 : DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4472 : }
4473 :
4474 : return r;
4475 : }
4476 :
4477 : /**
4478 : * amdgpu_device_has_job_running - check if there is any job in the pending list
4479 : *
4480 : * @adev: amdgpu_device pointer
4481 : *
4482 : * check if there is any job in the pending list
4483 : */
4484 0 : bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4485 : {
4486 : int i;
4487 : struct drm_sched_job *job;
4488 :
4489 0 : for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4490 0 : struct amdgpu_ring *ring = adev->rings[i];
4491 :
4492 0 : if (!ring || !ring->sched.thread)
4493 0 : continue;
4494 :
4495 0 : spin_lock(&ring->sched.job_list_lock);
4496 0 : job = list_first_entry_or_null(&ring->sched.pending_list,
4497 : struct drm_sched_job, list);
4498 0 : spin_unlock(&ring->sched.job_list_lock);
4499 0 : if (job)
4500 : return true;
4501 : }
4502 : return false;
4503 : }
4504 :
4505 : /**
4506 : * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4507 : *
4508 : * @adev: amdgpu_device pointer
4509 : *
4510 : * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4511 : * a hung GPU.
4512 : */
4513 0 : bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4514 : {
4515 :
4516 0 : if (amdgpu_gpu_recovery == 0)
4517 : goto disabled;
4518 :
4519 0 : if (!amdgpu_device_ip_check_soft_reset(adev)) {
4520 0 : dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
4521 0 : return false;
4522 : }
4523 :
4524 0 : if (amdgpu_sriov_vf(adev))
4525 : return true;
4526 :
4527 0 : if (amdgpu_gpu_recovery == -1) {
4528 0 : switch (adev->asic_type) {
4529 : #ifdef CONFIG_DRM_AMDGPU_SI
4530 : case CHIP_VERDE:
4531 : case CHIP_TAHITI:
4532 : case CHIP_PITCAIRN:
4533 : case CHIP_OLAND:
4534 : case CHIP_HAINAN:
4535 : #endif
4536 : #ifdef CONFIG_DRM_AMDGPU_CIK
4537 : case CHIP_KAVERI:
4538 : case CHIP_KABINI:
4539 : case CHIP_MULLINS:
4540 : #endif
4541 : case CHIP_CARRIZO:
4542 : case CHIP_STONEY:
4543 : case CHIP_CYAN_SKILLFISH:
4544 : goto disabled;
4545 : default:
4546 : break;
4547 : }
4548 : }
4549 :
4550 : return true;
4551 :
4552 : disabled:
4553 0 : dev_info(adev->dev, "GPU recovery disabled.\n");
4554 0 : return false;
4555 : }
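
/*
 * Example (illustrative) of how the module option maps onto the checks above:
 * amdgpu.gpu_recovery=0 disables recovery outright, amdgpu.gpu_recovery=1
 * allows it on all ASICs, and the default of -1 allows it except on the
 * legacy ASICs listed in the switch statement. SR-IOV VFs skip the per-ASIC
 * check and attempt recovery whenever the option is not 0.
 */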
4556 :
4557 0 : int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4558 : {
4559 : u32 i;
4560 0 : int ret = 0;
4561 :
4562 0 : amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4563 :
4564 0 : dev_info(adev->dev, "GPU mode1 reset\n");
4565 :
4566 : /* disable BM */
4567 0 : pci_clear_master(adev->pdev);
4568 :
4569 0 : amdgpu_device_cache_pci_state(adev->pdev);
4570 :
4571 0 : if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4572 0 : dev_info(adev->dev, "GPU smu mode1 reset\n");
4573 0 : ret = amdgpu_dpm_mode1_reset(adev);
4574 : } else {
4575 0 : dev_info(adev->dev, "GPU psp mode1 reset\n");
4576 0 : ret = psp_gpu_reset(adev);
4577 : }
4578 :
4579 0 : if (ret)
4580 0 : dev_err(adev->dev, "GPU mode1 reset failed\n");
4581 :
4582 0 : amdgpu_device_load_pci_state(adev->pdev);
4583 :
4584 : /* wait for asic to come out of reset */
4585 0 : for (i = 0; i < adev->usec_timeout; i++) {
4586 0 : u32 memsize = adev->nbio.funcs->get_memsize(adev);
4587 :
4588 0 : if (memsize != 0xffffffff)
4589 : break;
4590 0 : udelay(1);
4591 : }
4592 :
4593 0 : amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4594 0 : return ret;
4595 : }
4596 :
4597 0 : int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4598 : struct amdgpu_reset_context *reset_context)
4599 : {
4600 0 : int i, r = 0;
4601 0 : struct amdgpu_job *job = NULL;
4602 0 : bool need_full_reset =
4603 0 : test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4604 :
4605 0 : if (reset_context->reset_req_dev == adev)
4606 0 : job = reset_context->job;
4607 :
4608 0 : if (amdgpu_sriov_vf(adev)) {
4609 : /* stop the data exchange thread */
4610 0 : amdgpu_virt_fini_data_exchange(adev);
4611 : }
4612 :
4613 0 : amdgpu_fence_driver_isr_toggle(adev, true);
4614 :
4615 : /* block all schedulers and reset given job's ring */
4616 0 : for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4617 0 : struct amdgpu_ring *ring = adev->rings[i];
4618 :
4619 0 : if (!ring || !ring->sched.thread)
4620 0 : continue;
4621 :
4622 :		/* Clear job fences from the fence driver to avoid force_completion
4623 :		 * leaving NULL and VM flush fences in the fence driver. */
4624 0 : amdgpu_fence_driver_clear_job_fences(ring);
4625 :
4626 : /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4627 0 : amdgpu_fence_driver_force_completion(ring);
4628 : }
4629 :
4630 0 : amdgpu_fence_driver_isr_toggle(adev, false);
4631 :
4632 0 : if (job && job->vm)
4633 0 : drm_sched_increase_karma(&job->base);
4634 :
4635 0 : r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4636 : /* If reset handler not implemented, continue; otherwise return */
4637 0 : if (r == -ENOSYS)
4638 0 : r = 0;
4639 : else
4640 : return r;
4641 :
4642 : /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4643 0 : if (!amdgpu_sriov_vf(adev)) {
4644 :
4645 0 : if (!need_full_reset)
4646 0 : need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4647 :
4648 0 : if (!need_full_reset && amdgpu_gpu_recovery) {
4649 0 : amdgpu_device_ip_pre_soft_reset(adev);
4650 0 : r = amdgpu_device_ip_soft_reset(adev);
4651 0 : amdgpu_device_ip_post_soft_reset(adev);
4652 0 : if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4653 0 : dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4654 0 : need_full_reset = true;
4655 : }
4656 : }
4657 :
4658 0 : if (need_full_reset)
4659 0 : r = amdgpu_device_ip_suspend(adev);
4660 0 : if (need_full_reset)
4661 0 : set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4662 : else
4663 : clear_bit(AMDGPU_NEED_FULL_RESET,
4664 0 : &reset_context->flags);
4665 : }
4666 :
4667 : return r;
4668 : }
4669 :
4670 0 : static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4671 : {
4672 : int i;
4673 :
4674 : lockdep_assert_held(&adev->reset_domain->sem);
4675 :
4676 0 : for (i = 0; i < adev->num_regs; i++) {
4677 0 : adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4678 0 : trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4679 0 : adev->reset_dump_reg_value[i]);
4680 : }
4681 :
4682 0 : return 0;
4683 : }
4684 :
4685 : #ifdef CONFIG_DEV_COREDUMP
4686 : static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4687 : size_t count, void *data, size_t datalen)
4688 : {
4689 : struct drm_printer p;
4690 : struct amdgpu_device *adev = data;
4691 : struct drm_print_iterator iter;
4692 : int i;
4693 :
4694 : iter.data = buffer;
4695 : iter.offset = 0;
4696 : iter.start = offset;
4697 : iter.remain = count;
4698 :
4699 : p = drm_coredump_printer(&iter);
4700 :
4701 : drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4702 : drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4703 : drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4704 : drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4705 : if (adev->reset_task_info.pid)
4706 : drm_printf(&p, "process_name: %s PID: %d\n",
4707 : adev->reset_task_info.process_name,
4708 : adev->reset_task_info.pid);
4709 :
4710 : if (adev->reset_vram_lost)
4711 : drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4712 : if (adev->num_regs) {
4713 : drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4714 :
4715 : for (i = 0; i < adev->num_regs; i++)
4716 : drm_printf(&p, "0x%08x: 0x%08x\n",
4717 : adev->reset_dump_reg_list[i],
4718 : adev->reset_dump_reg_value[i]);
4719 : }
4720 :
4721 : return count - iter.remain;
4722 : }
4723 :
4724 : static void amdgpu_devcoredump_free(void *data)
4725 : {
4726 : }
4727 :
4728 : static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4729 : {
4730 : struct drm_device *dev = adev_to_drm(adev);
4731 :
4732 : ktime_get_ts64(&adev->reset_time);
4733 : dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4734 : amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4735 : }
4736 : #endif
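
/*
 * Illustrative note (an assumption about the devcoredump core rather than
 * this file): once amdgpu_reset_capture_coredumpm() has registered the dump,
 * userspace can typically read it from the devcoredump class device, e.g.
 * /sys/class/devcoredump/devcd<N>/data, and writing to that file releases it.
 */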
4737 :
4738 0 : int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4739 : struct amdgpu_reset_context *reset_context)
4740 : {
4741 0 : struct amdgpu_device *tmp_adev = NULL;
4742 0 : bool need_full_reset, skip_hw_reset, vram_lost = false;
4743 0 : int r = 0;
4744 :
4745 : /* Try reset handler method first */
4746 0 : tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4747 : reset_list);
4748 0 : amdgpu_reset_reg_dumps(tmp_adev);
4749 :
4750 0 : reset_context->reset_device_list = device_list_handle;
4751 0 : r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4752 : /* If reset handler not implemented, continue; otherwise return */
4753 0 : if (r == -ENOSYS)
4754 0 : r = 0;
4755 : else
4756 : return r;
4757 :
4758 : /* Reset handler not implemented, use the default method */
4759 0 : need_full_reset =
4760 0 : test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4761 0 : skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4762 :
4763 : /*
4764 : * ASIC reset has to be done on all XGMI hive nodes ASAP
4765 :	 * to allow proper link negotiation in FW (within 1 sec)
4766 : */
4767 0 : if (!skip_hw_reset && need_full_reset) {
4768 0 : list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4769 : /* For XGMI run all resets in parallel to speed up the process */
4770 0 : if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4771 0 : tmp_adev->gmc.xgmi.pending_reset = false;
4772 0 : if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4773 0 : r = -EALREADY;
4774 : } else
4775 0 : r = amdgpu_asic_reset(tmp_adev);
4776 :
4777 0 : if (r) {
4778 0 : dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4779 : r, adev_to_drm(tmp_adev)->unique);
4780 0 : break;
4781 : }
4782 : }
4783 :
4784 : /* For XGMI wait for all resets to complete before proceed */
4785 0 : if (!r) {
4786 0 : list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4787 0 : if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4788 0 : flush_work(&tmp_adev->xgmi_reset_work);
4789 0 : r = tmp_adev->asic_reset_res;
4790 0 : if (r)
4791 : break;
4792 : }
4793 : }
4794 : }
4795 : }
4796 :
4797 0 : if (!r && amdgpu_ras_intr_triggered()) {
4798 0 : list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4799 0 : if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4800 0 : tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4801 0 : tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4802 : }
4803 :
4804 : amdgpu_ras_intr_cleared();
4805 : }
4806 :
4807 0 : list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4808 0 : if (need_full_reset) {
4809 : /* post card */
4810 0 : r = amdgpu_device_asic_init(tmp_adev);
4811 0 : if (r) {
4812 0 : dev_warn(tmp_adev->dev, "asic atom init failed!");
4813 : } else {
4814 0 : dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4815 0 : r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4816 0 : if (r)
4817 : goto out;
4818 :
4819 0 : r = amdgpu_device_ip_resume_phase1(tmp_adev);
4820 0 : if (r)
4821 : goto out;
4822 :
4823 0 : vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4824 : #ifdef CONFIG_DEV_COREDUMP
4825 : tmp_adev->reset_vram_lost = vram_lost;
4826 : memset(&tmp_adev->reset_task_info, 0,
4827 : sizeof(tmp_adev->reset_task_info));
4828 : if (reset_context->job && reset_context->job->vm)
4829 : tmp_adev->reset_task_info =
4830 : reset_context->job->vm->task_info;
4831 : amdgpu_reset_capture_coredumpm(tmp_adev);
4832 : #endif
4833 0 : if (vram_lost) {
4834 0 : DRM_INFO("VRAM is lost due to GPU reset!\n");
4835 0 : amdgpu_inc_vram_lost(tmp_adev);
4836 : }
4837 :
4838 0 : r = amdgpu_device_fw_loading(tmp_adev);
4839 0 : if (r)
4840 : return r;
4841 :
4842 0 : r = amdgpu_device_ip_resume_phase2(tmp_adev);
4843 0 : if (r)
4844 : goto out;
4845 :
4846 0 : if (vram_lost)
4847 : amdgpu_device_fill_reset_magic(tmp_adev);
4848 :
4849 : /*
4850 : * Add this ASIC as tracked as reset was already
4851 : * complete successfully.
4852 : */
4853 0 : amdgpu_register_gpu_instance(tmp_adev);
4854 :
4855 0 : if (!reset_context->hive &&
4856 0 : tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4857 0 : amdgpu_xgmi_add_device(tmp_adev);
4858 :
4859 0 : r = amdgpu_device_ip_late_init(tmp_adev);
4860 0 : if (r)
4861 : goto out;
4862 :
4863 0 : drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4864 :
4865 : /*
4866 :				 * The GPU enters a bad state once the number of
4867 :				 * faulty pages reported by ECC reaches the
4868 :				 * threshold, and RAS recovery is scheduled next.
4869 :				 * So add a check here to break recovery if it
4870 :				 * indeed exceeds the bad page threshold, and
4871 :				 * remind the user to retire this GPU or set a
4872 :				 * bigger bad_page_threshold value the next time
4873 :				 * the driver is probed.
4874 : */
4875 0 : if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4876 : /* must succeed. */
4877 0 : amdgpu_ras_resume(tmp_adev);
4878 : } else {
4879 : r = -EINVAL;
4880 : goto out;
4881 : }
4882 :
4883 : /* Update PSP FW topology after reset */
4884 0 : if (reset_context->hive &&
4885 0 : tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4886 0 : r = amdgpu_xgmi_update_topology(
4887 : reset_context->hive, tmp_adev);
4888 : }
4889 : }
4890 :
4891 : out:
4892 0 : if (!r) {
4893 0 : amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4894 0 : r = amdgpu_ib_ring_tests(tmp_adev);
4895 0 : if (r) {
4896 0 : dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4897 0 : need_full_reset = true;
4898 0 : r = -EAGAIN;
4899 0 : goto end;
4900 : }
4901 : }
4902 :
4903 0 : if (!r)
4904 0 : r = amdgpu_device_recover_vram(tmp_adev);
4905 : else
4906 0 : tmp_adev->asic_reset_res = r;
4907 : }
4908 :
4909 : end:
4910 0 : if (need_full_reset)
4911 0 : set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4912 : else
4913 0 : clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4914 : return r;
4915 : }
4916 :
4917 : static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
4918 : {
4919 :
4920 0 : switch (amdgpu_asic_reset_method(adev)) {
4921 : case AMD_RESET_METHOD_MODE1:
4922 0 : adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4923 : break;
4924 : case AMD_RESET_METHOD_MODE2:
4925 0 : adev->mp1_state = PP_MP1_STATE_RESET;
4926 : break;
4927 : default:
4928 0 : adev->mp1_state = PP_MP1_STATE_NONE;
4929 : break;
4930 : }
4931 : }
4932 :
4933 : static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
4934 : {
4935 0 : amdgpu_vf_error_trans_all(adev);
4936 0 : adev->mp1_state = PP_MP1_STATE_NONE;
4937 : }
4938 :
4939 0 : static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4940 : {
4941 0 : struct pci_dev *p = NULL;
4942 :
4943 0 : p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4944 0 : adev->pdev->bus->number, 1);
4945 0 : if (p) {
4946 0 : pm_runtime_enable(&(p->dev));
4947 0 : pm_runtime_resume(&(p->dev));
4948 : }
4949 0 : }
4950 :
4951 0 : static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4952 : {
4953 : enum amd_reset_method reset_method;
4954 0 : struct pci_dev *p = NULL;
4955 : u64 expires;
4956 :
4957 : /*
4958 :	 * For now, only BACO and mode1 reset are confirmed to suffer
4959 :	 * from the audio issue when the audio device is not properly suspended.
4960 : */
4961 0 : reset_method = amdgpu_asic_reset_method(adev);
4962 0 : if ((reset_method != AMD_RESET_METHOD_BACO) &&
4963 0 : (reset_method != AMD_RESET_METHOD_MODE1))
4964 : return -EINVAL;
4965 :
4966 0 : p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4967 0 : adev->pdev->bus->number, 1);
4968 0 : if (!p)
4969 : return -ENODEV;
4970 :
4971 0 : expires = pm_runtime_autosuspend_expiration(&(p->dev));
4972 0 : if (!expires)
4973 : /*
4974 :		 * If we cannot get the audio device autosuspend delay,
4975 :		 * a fixed 4s interval is used. Since 3s is the audio
4976 :		 * controller's default autosuspend delay setting, the
4977 :		 * 4s used here is guaranteed to cover it.
4978 : */
4979 0 : expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4980 :
4981 0 : while (!pm_runtime_status_suspended(&(p->dev))) {
4982 0 : if (!pm_runtime_suspend(&(p->dev)))
4983 : break;
4984 :
4985 0 : if (expires < ktime_get_mono_fast_ns()) {
4986 0 : dev_warn(adev->dev, "failed to suspend display audio\n");
4987 : /* TODO: abort the succeeding gpu reset? */
4988 0 : return -ETIMEDOUT;
4989 : }
4990 : }
4991 :
4992 0 : pm_runtime_disable(&(p->dev));
4993 :
4994 0 : return 0;
4995 : }
4996 :
4997 0 : static void amdgpu_device_recheck_guilty_jobs(
4998 : struct amdgpu_device *adev, struct list_head *device_list_handle,
4999 : struct amdgpu_reset_context *reset_context)
5000 : {
5001 0 : int i, r = 0;
5002 :
5003 0 : for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5004 0 : struct amdgpu_ring *ring = adev->rings[i];
5005 0 : int ret = 0;
5006 : struct drm_sched_job *s_job;
5007 :
5008 0 : if (!ring || !ring->sched.thread)
5009 0 : continue;
5010 :
5011 0 : s_job = list_first_entry_or_null(&ring->sched.pending_list,
5012 : struct drm_sched_job, list);
5013 0 : if (s_job == NULL)
5014 0 : continue;
5015 :
5016 :		/* clear the job's guilty status and rely on the following step to decide the real one */
5017 0 : drm_sched_reset_karma(s_job);
5018 0 : drm_sched_resubmit_jobs_ext(&ring->sched, 1);
5019 :
5020 0 : if (!s_job->s_fence->parent) {
5021 0 : DRM_WARN("Failed to get a HW fence for job!");
5022 0 : continue;
5023 : }
5024 :
5025 0 : ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
5026 0 : if (ret == 0) { /* timeout */
5027 0 : DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
5028 : ring->sched.name, s_job->id);
5029 :
5030 :
5031 0 : amdgpu_fence_driver_isr_toggle(adev, true);
5032 :
5033 : /* Clear this failed job from fence array */
5034 0 : amdgpu_fence_driver_clear_job_fences(ring);
5035 :
5036 0 : amdgpu_fence_driver_isr_toggle(adev, false);
5037 :
5038 :			/* Since the job won't signal and we go for
5039 :			 * another resubmit, drop this parent pointer
5040 : */
5041 0 : dma_fence_put(s_job->s_fence->parent);
5042 0 : s_job->s_fence->parent = NULL;
5043 :
5044 : /* set guilty */
5045 0 : drm_sched_increase_karma(s_job);
5046 0 : amdgpu_reset_prepare_hwcontext(adev, reset_context);
5047 : retry:
5048 : /* do hw reset */
5049 0 : if (amdgpu_sriov_vf(adev)) {
5050 0 : amdgpu_virt_fini_data_exchange(adev);
5051 0 : r = amdgpu_device_reset_sriov(adev, false);
5052 0 : if (r)
5053 0 : adev->asic_reset_res = r;
5054 : } else {
5055 0 : clear_bit(AMDGPU_SKIP_HW_RESET,
5056 0 : &reset_context->flags);
5057 0 : r = amdgpu_do_asic_reset(device_list_handle,
5058 : reset_context);
5059 0 : if (r && r == -EAGAIN)
5060 : goto retry;
5061 : }
5062 :
5063 : /*
5064 :			 * bump the reset counter so that the following
5065 :			 * resubmitted job can flush the VMID
5066 : */
5067 0 : atomic_inc(&adev->gpu_reset_counter);
5068 0 : continue;
5069 : }
5070 :
5071 : /* got the hw fence, signal finished fence */
5072 0 : atomic_dec(ring->sched.score);
5073 0 : dma_fence_get(&s_job->s_fence->finished);
5074 0 : dma_fence_signal(&s_job->s_fence->finished);
5075 0 : dma_fence_put(&s_job->s_fence->finished);
5076 :
5077 : /* remove node from list and free the job */
5078 0 : spin_lock(&ring->sched.job_list_lock);
5079 0 : list_del_init(&s_job->list);
5080 0 : spin_unlock(&ring->sched.job_list_lock);
5081 0 : ring->sched.ops->free_job(s_job);
5082 : }
5083 0 : }
5084 :
5085 0 : static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5086 : {
5087 0 : struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5088 :
5089 : #if defined(CONFIG_DEBUG_FS)
5090 : if (!amdgpu_sriov_vf(adev))
5091 : cancel_work(&adev->reset_work);
5092 : #endif
5093 :
5094 0 : if (adev->kfd.dev)
5095 0 : cancel_work(&adev->kfd.reset_work);
5096 :
5097 0 : if (amdgpu_sriov_vf(adev))
5098 0 : cancel_work(&adev->virt.flr_work);
5099 :
5100 0 : if (con && adev->ras_enabled)
5101 0 : cancel_work(&con->recovery_work);
5102 :
5103 0 : }
5104 :
5105 :
5106 : /**
5107 : * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5108 : *
5109 : * @adev: amdgpu_device pointer
5110 : * @job: which job triggered the hang
5111 : *
5112 : * Attempt to reset the GPU if it has hung (all asics).
5113 : * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5114 : * Returns 0 for success or an error on failure.
5115 : */
5116 :
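/*
 * Illustrative caller sketch (an assumption; the real callers, such as the
 * job timeout handler, live outside this file): a hang handler would
 * typically prepare a reset context and then invoke this entry point, e.g.
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 */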
5117 0 : int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5118 : struct amdgpu_job *job,
5119 : struct amdgpu_reset_context *reset_context)
5120 : {
5121 0 : struct list_head device_list, *device_list_handle = NULL;
5122 0 : bool job_signaled = false;
5123 0 : struct amdgpu_hive_info *hive = NULL;
5124 0 : struct amdgpu_device *tmp_adev = NULL;
5125 0 : int i, r = 0;
5126 0 : bool need_emergency_restart = false;
5127 0 : bool audio_suspended = false;
5128 : int tmp_vram_lost_counter;
5129 :
5130 : /*
5131 : * Special case: RAS triggered and full reset isn't supported
5132 : */
5133 0 : need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5134 :
5135 : /*
5136 : * Flush RAM to disk so that after reboot
5137 :	 * the user can read the log and see why the system rebooted.
5138 : */
5139 0 : if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5140 0 : DRM_WARN("Emergency reboot.");
5141 :
5142 0 : ksys_sync_helper();
5143 0 : emergency_restart();
5144 : }
5145 :
5146 0 : dev_info(adev->dev, "GPU %s begin!\n",
5147 : need_emergency_restart ? "jobs stop":"reset");
5148 :
5149 0 : if (!amdgpu_sriov_vf(adev))
5150 0 : hive = amdgpu_get_xgmi_hive(adev);
5151 0 : if (hive)
5152 0 : mutex_lock(&hive->hive_lock);
5153 :
5154 0 : reset_context->job = job;
5155 0 : reset_context->hive = hive;
5156 :
5157 : /*
5158 : * Build list of devices to reset.
5159 :	 * In case we are in XGMI hive mode, re-sort the device list
5160 :	 * to put adev in the first position.
5161 : */
5162 0 : INIT_LIST_HEAD(&device_list);
5163 0 : if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5164 0 : list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5165 0 : list_add_tail(&tmp_adev->reset_list, &device_list);
5166 0 : if (!list_is_first(&adev->reset_list, &device_list))
5167 0 : list_rotate_to_front(&adev->reset_list, &device_list);
5168 : device_list_handle = &device_list;
5169 : } else {
5170 0 : list_add_tail(&adev->reset_list, &device_list);
5171 0 : device_list_handle = &device_list;
5172 : }
5173 :
5174 : /* We need to lock reset domain only once both for XGMI and single device */
5175 0 : tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5176 : reset_list);
5177 0 : amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5178 :
5179 : /* block all schedulers and reset given job's ring */
5180 0 : list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5181 :
5182 0 : amdgpu_device_set_mp1_state(tmp_adev);
5183 :
5184 : /*
5185 : * Try to put the audio codec into a suspended state
5186 : * before the GPU reset starts.
5187 : *
5188 : * The power domain of the graphics device is shared
5189 : * with the AZ (audio) power domain. Without this, we
5190 : * may change the audio hardware behind the audio
5191 : * driver's back, which can trigger audio codec
5192 : * errors.
5193 : */
5194 0 : if (!amdgpu_device_suspend_display_audio(tmp_adev))
5195 0 : audio_suspended = true;
5196 :
5197 0 : amdgpu_ras_set_error_query_ready(tmp_adev, false);
5198 :
5199 0 : cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5200 :
5201 0 : if (!amdgpu_sriov_vf(tmp_adev))
5202 0 : amdgpu_amdkfd_pre_reset(tmp_adev);
5203 :
5204 : /*
5205 : * Mark the ASICs to be reset as untracked first,
5206 : * and add them back after the reset has completed.
5207 : */
5208 0 : amdgpu_unregister_gpu_instance(tmp_adev);
5209 :
5210 0 : drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5211 :
5212 : /* disable ras on ALL IPs */
5213 0 : if (!need_emergency_restart &&
5214 0 : amdgpu_device_ip_need_full_reset(tmp_adev))
5215 0 : amdgpu_ras_suspend(tmp_adev);
5216 :
5217 0 : for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5218 0 : struct amdgpu_ring *ring = tmp_adev->rings[i];
5219 :
5220 0 : if (!ring || !ring->sched.thread)
5221 0 : continue;
5222 :
5223 0 : drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5224 :
5225 0 : if (need_emergency_restart)
5226 0 : amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5227 : }
5228 0 : atomic_inc(&tmp_adev->gpu_reset_counter);
5229 : }
5230 :
5231 0 : if (need_emergency_restart)
5232 : goto skip_sched_resume;
5233 :
5234 : /*
5235 : * Must check whether the guilty job has already signaled, since after
5236 : * this point all old HW fences are force-signaled.
5237 : *
5238 : * job->base holds a reference to the parent fence.
5239 : */
5240 0 : if (job && dma_fence_is_signaled(&job->hw_fence)) {
5241 0 : job_signaled = true;
5242 0 : dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5243 0 : goto skip_hw_reset;
5244 : }
5245 :
5246 : retry: /* Rest of adevs pre asic reset from XGMI hive. */
5247 0 : list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5248 0 : r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5249 : /* TODO: should we stop here? */
5250 0 : if (r) {
5251 0 : dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5252 : r, adev_to_drm(tmp_adev)->unique);
5253 0 : tmp_adev->asic_reset_res = r;
5254 : }
5255 :
5256 : /*
5257 : * Drop all pending non-scheduler resets. Scheduler resets
5258 : * were already dropped during drm_sched_stop.
5259 : */
5260 0 : amdgpu_device_stop_pending_resets(tmp_adev);
5261 : }
5262 :
5263 0 : tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
5264 : /* Actual ASIC resets if needed. */
5265 : /* The host driver will handle the XGMI hive reset for SRIOV. */
5266 0 : if (amdgpu_sriov_vf(adev)) {
5267 0 : r = amdgpu_device_reset_sriov(adev, job ? false : true);
5268 0 : if (r)
5269 0 : adev->asic_reset_res = r;
5270 :
5271 : /* Aldebaran supports RAS in SRIOV, so RAS needs to be resumed during reset */
5272 0 : if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5273 0 : amdgpu_ras_resume(adev);
5274 : } else {
5275 0 : r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5276 0 : if (r && r == -EAGAIN) {
5277 0 : set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
5278 0 : adev->asic_reset_res = 0;
5279 0 : goto retry;
5280 : }
5281 : }
5282 :
5283 : skip_hw_reset:
5284 :
5285 : /* Post-ASIC-reset handling for all devices. */
5286 0 : list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5287 :
5288 : /*
5289 : * Sometimes a bad compute job submitted later can block a good gfx job,
5290 : * because the gfx and compute rings share internal GC hardware. Add an
5291 : * extra guilty-job recheck step to find the real guilty job: it
5292 : * synchronously resubmits jobs and waits for the first one to signal.
5293 : * If that wait times out, the job is identified as the real guilty one.
5294 : */
5295 0 : if (amdgpu_gpu_recovery == 2 &&
5296 0 : !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5297 0 : amdgpu_device_recheck_guilty_jobs(
5298 : tmp_adev, device_list_handle, reset_context);
5299 :
5300 0 : for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5301 0 : struct amdgpu_ring *ring = tmp_adev->rings[i];
5302 :
5303 0 : if (!ring || !ring->sched.thread)
5304 0 : continue;
5305 :
5306 : /* No point in resubmitting jobs if we didn't do a HW reset */
5307 0 : if (!tmp_adev->asic_reset_res && !job_signaled)
5308 0 : drm_sched_resubmit_jobs(&ring->sched);
5309 :
5310 0 : drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5311 : }
5312 :
5313 0 : if (adev->enable_mes)
5314 0 : amdgpu_mes_self_test(tmp_adev);
5315 :
5316 0 : if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5317 0 : drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5318 : }
5319 :
5320 0 : if (tmp_adev->asic_reset_res)
5321 0 : r = tmp_adev->asic_reset_res;
5322 :
5323 0 : tmp_adev->asic_reset_res = 0;
5324 :
5325 0 : if (r) {
5326 : /* Bad news: how do we report this to user space? */
5327 0 : dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5328 0 : amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5329 : } else {
5330 0 : dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5331 0 : if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5332 : DRM_WARN("smart shift update failed\n");
5333 : }
5334 : }
5335 :
5336 : skip_sched_resume:
5337 0 : list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5338 : /* unlock kfd: SRIOV would do it separately */
5339 0 : if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5340 0 : amdgpu_amdkfd_post_reset(tmp_adev);
5341 :
5342 : /* kfd_post_reset does nothing if the kfd device is not initialized;
5343 : * we need to bring up kfd here if it was not initialized before
5344 : */
5345 0 : if (!adev->kfd.init_complete)
5346 0 : amdgpu_amdkfd_device_init(adev);
5347 :
5348 0 : if (audio_suspended)
5349 0 : amdgpu_device_resume_display_audio(tmp_adev);
5350 :
5351 0 : amdgpu_device_unset_mp1_state(tmp_adev);
5352 : }
5353 :
5354 0 : tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5355 : reset_list);
5356 0 : amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5357 :
5358 0 : if (hive) {
5359 0 : mutex_unlock(&hive->hive_lock);
5360 0 : amdgpu_put_xgmi_hive(hive);
5361 : }
5362 :
5363 0 : if (r)
5364 0 : dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5365 :
5366 0 : atomic_set(&adev->reset_domain->reset_res, r);
5367 0 : return r;
5368 : }
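/*
 * A minimal sketch of how a caller (for example, a ring timeout handler)
 * might build an amdgpu_reset_context and enter the recovery path above.
 * Illustrative only: the function name is hypothetical, and the field
 * usage mirrors what this file itself does in amdgpu_pci_slot_reset();
 * the real call sites live elsewhere in the driver.
 */
static int example_trigger_gpu_recover(struct amdgpu_device *adev,
				       struct amdgpu_job *job)
{
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));

	reset_context.method = AMD_RESET_METHOD_NONE;	/* let the core choose */
	reset_context.reset_req_dev = adev;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	/* job and hive are filled in by amdgpu_device_gpu_recover() itself */
	return amdgpu_device_gpu_recover(adev, job, &reset_context);
}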
5369 :
5370 : /**
5371 : * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5372 : *
5373 : * @adev: amdgpu_device pointer
5374 : *
5375 : * Fetches and stores in the driver the PCIe capabilities (gen speed
5376 : * and lane count) of the slot the device is in. Handles APUs and
5377 : * virtualized environments where the PCIe config space may not be available.
5378 : */
5379 0 : static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5380 : {
5381 : struct pci_dev *pdev;
5382 : enum pci_bus_speed speed_cap, platform_speed_cap;
5383 : enum pcie_link_width platform_link_width;
5384 :
5385 0 : if (amdgpu_pcie_gen_cap)
5386 0 : adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5387 :
5388 0 : if (amdgpu_pcie_lane_cap)
5389 0 : adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5390 :
5391 : /* covers APUs as well */
5392 0 : if (pci_is_root_bus(adev->pdev->bus)) {
5393 0 : if (adev->pm.pcie_gen_mask == 0)
5394 0 : adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5395 0 : if (adev->pm.pcie_mlw_mask == 0)
5396 0 : adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5397 0 : return;
5398 : }
5399 :
5400 0 : if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5401 : return;
5402 :
5403 0 : pcie_bandwidth_available(adev->pdev, NULL,
5404 : &platform_speed_cap, &platform_link_width);
5405 :
5406 0 : if (adev->pm.pcie_gen_mask == 0) {
5407 : /* asic caps */
5408 0 : pdev = adev->pdev;
5409 0 : speed_cap = pcie_get_speed_cap(pdev);
5410 0 : if (speed_cap == PCI_SPEED_UNKNOWN) {
5411 0 : adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5412 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5413 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5414 : } else {
5415 0 : if (speed_cap == PCIE_SPEED_32_0GT)
5416 0 : adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5417 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5418 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5419 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5420 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5421 0 : else if (speed_cap == PCIE_SPEED_16_0GT)
5422 0 : adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5423 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5424 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5425 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5426 0 : else if (speed_cap == PCIE_SPEED_8_0GT)
5427 0 : adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5428 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5429 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5430 0 : else if (speed_cap == PCIE_SPEED_5_0GT)
5431 0 : adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5432 : CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5433 : else
5434 0 : adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5435 : }
5436 : /* platform caps */
5437 0 : if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5438 0 : adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5439 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5440 : } else {
5441 0 : if (platform_speed_cap == PCIE_SPEED_32_0GT)
5442 0 : adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5443 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5444 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5445 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5446 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5447 0 : else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5448 0 : adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5449 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5450 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5451 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5452 0 : else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5453 0 : adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5454 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5455 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5456 0 : else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5457 0 : adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5458 : CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5459 : else
5460 0 : adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5461 :
5462 : }
5463 : }
5464 0 : if (adev->pm.pcie_mlw_mask == 0) {
5465 0 : if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5466 0 : adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5467 : } else {
5468 0 : switch (platform_link_width) {
5469 : case PCIE_LNK_X32:
5470 0 : adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5471 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5472 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5473 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5474 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5475 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5476 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5477 0 : break;
5478 : case PCIE_LNK_X16:
5479 0 : adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5480 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5481 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5482 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5483 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5484 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5485 0 : break;
5486 : case PCIE_LNK_X12:
5487 0 : adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5488 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5489 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5490 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5491 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5492 0 : break;
5493 : case PCIE_LNK_X8:
5494 0 : adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5495 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5496 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5497 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5498 0 : break;
5499 : case PCIE_LNK_X4:
5500 0 : adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5501 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5502 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5503 0 : break;
5504 : case PCIE_LNK_X2:
5505 0 : adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5506 : CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5507 0 : break;
5508 : case PCIE_LNK_X1:
5509 0 : adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5510 0 : break;
5511 : default:
5512 : break;
5513 : }
5514 : }
5515 : }
5516 : }
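/*
 * A minimal sketch (hypothetical helper, illustrative only) of how the
 * masks filled in above can be consulted: the gen mask is a bitwise OR of
 * CAIL_PCIE_LINK_SPEED_SUPPORT_GENn flags, so a simple bit test tells
 * whether a given link generation is usable on this platform.
 */
static bool example_platform_supports_pcie_gen3(struct amdgpu_device *adev)
{
	return (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3) != 0;
}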
5517 :
5518 : /**
5519 : * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5520 : *
5521 : * @adev: amdgpu_device pointer
5522 : * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5523 : *
5524 : * Return true if @peer_adev can access (DMA) @adev through the PCIe
5525 : * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5526 : * @peer_adev.
5527 : */
5528 0 : bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5529 : struct amdgpu_device *peer_adev)
5530 : {
5531 : #ifdef CONFIG_HSA_AMD_P2P
5532 : uint64_t address_mask = peer_adev->dev->dma_mask ?
5533 : ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5534 : resource_size_t aper_limit =
5535 : adev->gmc.aper_base + adev->gmc.aper_size - 1;
5536 : bool p2p_access = !adev->gmc.xgmi.connected_to_cpu &&
5537 : !(pci_p2pdma_distance_many(adev->pdev,
5538 : &peer_adev->dev, 1, true) < 0);
5539 :
5540 : return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5541 : adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5542 : !(adev->gmc.aper_base & address_mask ||
5543 : aper_limit & address_mask));
5544 : #else
5545 0 : return false;
5546 : #endif
5547 : }
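/*
 * A minimal sketch (hypothetical helper, illustrative only) of one way the
 * peer-accessibility check above could be used: only attempt a direct PCIe
 * P2P mapping when the BAR/DMA-mask check succeeds in both directions, and
 * otherwise fall back to staging through system memory.
 */
static bool example_can_use_p2p(struct amdgpu_device *adev,
				struct amdgpu_device *peer_adev)
{
	return amdgpu_device_is_peer_accessible(adev, peer_adev) &&
	       amdgpu_device_is_peer_accessible(peer_adev, adev);
}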
5548 :
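/*
 * BACO ("Bus Active, Chip Off") is a low-power state in which the PCIe link
 * stays active while the GPU core is powered down. The two helpers below
 * enter and leave that state through the DPM layer, and when RAS is enabled
 * they also toggle the doorbell interrupt around the transition.
 */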
5549 0 : int amdgpu_device_baco_enter(struct drm_device *dev)
5550 : {
5551 0 : struct amdgpu_device *adev = drm_to_adev(dev);
5552 0 : struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5553 :
5554 0 : if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5555 : return -ENOTSUPP;
5556 :
5557 0 : if (ras && adev->ras_enabled &&
5558 0 : adev->nbio.funcs->enable_doorbell_interrupt)
5559 0 : adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5560 :
5561 0 : return amdgpu_dpm_baco_enter(adev);
5562 : }
5563 :
5564 0 : int amdgpu_device_baco_exit(struct drm_device *dev)
5565 : {
5566 0 : struct amdgpu_device *adev = drm_to_adev(dev);
5567 0 : struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5568 0 : int ret = 0;
5569 :
5570 0 : if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5571 : return -ENOTSUPP;
5572 :
5573 0 : ret = amdgpu_dpm_baco_exit(adev);
5574 0 : if (ret)
5575 : return ret;
5576 :
5577 0 : if (ras && adev->ras_enabled &&
5578 0 : adev->nbio.funcs->enable_doorbell_interrupt)
5579 0 : adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5580 :
5581 0 : if (amdgpu_passthrough(adev) &&
5582 0 : adev->nbio.funcs->clear_doorbell_interrupt)
5583 0 : adev->nbio.funcs->clear_doorbell_interrupt(adev);
5584 :
5585 : return 0;
5586 : }
5587 :
5588 : /**
5589 : * amdgpu_pci_error_detected - Called when a PCI error is detected.
5590 : * @pdev: PCI device struct
5591 : * @state: PCI channel state
5592 : *
5593 : * Description: Called when a PCI error is detected.
5594 : *
5595 : * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5596 : */
5597 0 : pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5598 : {
5599 0 : struct drm_device *dev = pci_get_drvdata(pdev);
5600 0 : struct amdgpu_device *adev = drm_to_adev(dev);
5601 : int i;
5602 :
5603 0 : DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5604 :
5605 0 : if (adev->gmc.xgmi.num_physical_nodes > 1) {
5606 0 : DRM_WARN("No support for XGMI hive yet...");
5607 0 : return PCI_ERS_RESULT_DISCONNECT;
5608 : }
5609 :
5610 0 : adev->pci_channel_state = state;
5611 :
5612 0 : switch (state) {
5613 : case pci_channel_io_normal:
5614 : return PCI_ERS_RESULT_CAN_RECOVER;
5615 : /* Fatal error, prepare for slot reset */
5616 : case pci_channel_io_frozen:
5617 : /*
5618 : * Locking adev->reset_domain->sem will prevent any external access
5619 : * to GPU during PCI error recovery
5620 : */
5621 0 : amdgpu_device_lock_reset_domain(adev->reset_domain);
5622 : amdgpu_device_set_mp1_state(adev);
5623 :
5624 : /*
5625 : * Block any work scheduling as we do for regular GPU reset
5626 : * for the duration of the recovery
5627 : */
5628 0 : for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5629 0 : struct amdgpu_ring *ring = adev->rings[i];
5630 :
5631 0 : if (!ring || !ring->sched.thread)
5632 0 : continue;
5633 :
5634 0 : drm_sched_stop(&ring->sched, NULL);
5635 : }
5636 0 : atomic_inc(&adev->gpu_reset_counter);
5637 0 : return PCI_ERS_RESULT_NEED_RESET;
5638 : case pci_channel_io_perm_failure:
5639 : /* Permanent error, prepare for device removal */
5640 0 : return PCI_ERS_RESULT_DISCONNECT;
5641 : }
5642 :
5643 0 : return PCI_ERS_RESULT_NEED_RESET;
5644 : }
5645 :
5646 : /**
5647 : * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5648 : * @pdev: pointer to PCI device
5649 : */
5650 0 : pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5651 : {
5652 :
5653 0 : DRM_INFO("PCI error: mmio enabled callback!!\n");
5654 :
5655 : /* TODO - dump whatever for debugging purposes */
5656 :
5657 : /* This is called only if amdgpu_pci_error_detected returns
5658 : * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
5659 : * work, so there is no need to reset the slot.
5660 : */
5661 :
5662 0 : return PCI_ERS_RESULT_RECOVERED;
5663 : }
5664 :
5665 : /**
5666 : * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5667 : * @pdev: PCI device struct
5668 : *
5669 : * Description: This routine is called by the PCI error recovery
5670 : * code after the PCI slot has been reset, just before normal
5671 : * operation should resume.
5672 : */
5673 0 : pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5674 : {
5675 0 : struct drm_device *dev = pci_get_drvdata(pdev);
5676 0 : struct amdgpu_device *adev = drm_to_adev(dev);
5677 : int r, i;
5678 : struct amdgpu_reset_context reset_context;
5679 : u32 memsize;
5680 : struct list_head device_list;
5681 :
5682 0 : DRM_INFO("PCI error: slot reset callback!!\n");
5683 :
5684 0 : memset(&reset_context, 0, sizeof(reset_context));
5685 :
5686 0 : INIT_LIST_HEAD(&device_list);
5687 0 : list_add_tail(&adev->reset_list, &device_list);
5688 :
5689 : /* wait for asic to come out of reset */
5690 0 : msleep(500);
5691 :
5692 : /* Restore the PCI config space */
5693 0 : amdgpu_device_load_pci_state(pdev);
5694 :
5695 : /* confirm ASIC came out of reset */
5696 0 : for (i = 0; i < adev->usec_timeout; i++) {
5697 0 : memsize = amdgpu_asic_get_config_memsize(adev);
5698 :
5699 0 : if (memsize != 0xffffffff)
5700 : break;
5701 0 : udelay(1);
5702 : }
5703 0 : if (memsize == 0xffffffff) {
5704 : r = -ETIME;
5705 : goto out;
5706 : }
5707 :
5708 0 : reset_context.method = AMD_RESET_METHOD_NONE;
5709 0 : reset_context.reset_req_dev = adev;
5710 0 : set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5711 0 : set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5712 0 : set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
5713 :
5714 0 : adev->no_hw_access = true;
5715 0 : r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5716 0 : adev->no_hw_access = false;
5717 0 : if (r)
5718 : goto out;
5719 :
5720 0 : r = amdgpu_do_asic_reset(&device_list, &reset_context);
5721 :
5722 : out:
5723 0 : if (!r) {
5724 0 : if (amdgpu_device_cache_pci_state(adev->pdev))
5725 0 : pci_restore_state(adev->pdev);
5726 :
5727 0 : DRM_INFO("PCIe error recovery succeeded\n");
5728 : } else {
5729 0 : DRM_ERROR("PCIe error recovery failed, err:%d", r);
5730 0 : amdgpu_device_unset_mp1_state(adev);
5731 0 : amdgpu_device_unlock_reset_domain(adev->reset_domain);
5732 : }
5733 :
5734 0 : return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5735 : }
5736 :
5737 : /**
5738 : * amdgpu_pci_resume() - resume normal ops after PCI reset
5739 : * @pdev: pointer to PCI device
5740 : *
5741 : * Called when the error recovery driver tells us that it is
5742 : * OK to resume normal operation.
5743 : */
5744 0 : void amdgpu_pci_resume(struct pci_dev *pdev)
5745 : {
5746 0 : struct drm_device *dev = pci_get_drvdata(pdev);
5747 0 : struct amdgpu_device *adev = drm_to_adev(dev);
5748 : int i;
5749 :
5750 :
5751 0 : DRM_INFO("PCI error: resume callback!!\n");
5752 :
5753 : /* Only continue execution for the case of pci_channel_io_frozen */
5754 0 : if (adev->pci_channel_state != pci_channel_io_frozen)
5755 : return;
5756 :
5757 0 : for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5758 0 : struct amdgpu_ring *ring = adev->rings[i];
5759 :
5760 0 : if (!ring || !ring->sched.thread)
5761 0 : continue;
5762 :
5763 :
5764 0 : drm_sched_resubmit_jobs(&ring->sched);
5765 0 : drm_sched_start(&ring->sched, true);
5766 : }
5767 :
5768 0 : amdgpu_device_unset_mp1_state(adev);
5769 0 : amdgpu_device_unlock_reset_domain(adev->reset_domain);
5770 : }
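/*
 * A minimal sketch of how the four callbacks above are typically tied
 * together: they are exposed to the PCI core through a
 * struct pci_error_handlers referenced from the driver's struct pci_driver.
 * The struct name here is illustrative; the actual hookup lives in the
 * driver registration code, not in this file.
 */
static const struct pci_error_handlers example_amdgpu_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};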
5771 :
5772 0 : bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5773 : {
5774 0 : struct drm_device *dev = pci_get_drvdata(pdev);
5775 0 : struct amdgpu_device *adev = drm_to_adev(dev);
5776 : int r;
5777 :
5778 0 : r = pci_save_state(pdev);
5779 0 : if (!r) {
5780 0 : kfree(adev->pci_state);
5781 :
5782 0 : adev->pci_state = pci_store_saved_state(pdev);
5783 :
5784 0 : if (!adev->pci_state) {
5785 0 : DRM_ERROR("Failed to store PCI saved state");
5786 0 : return false;
5787 : }
5788 : } else {
5789 0 : DRM_WARN("Failed to save PCI state, err:%d\n", r);
5790 0 : return false;
5791 : }
5792 :
5793 : return true;
5794 : }
5795 :
5796 0 : bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5797 : {
5798 0 : struct drm_device *dev = pci_get_drvdata(pdev);
5799 0 : struct amdgpu_device *adev = drm_to_adev(dev);
5800 : int r;
5801 :
5802 0 : if (!adev->pci_state)
5803 : return false;
5804 :
5805 0 : r = pci_load_saved_state(pdev, adev->pci_state);
5806 :
5807 0 : if (!r) {
5808 0 : pci_restore_state(pdev);
5809 : } else {
5810 0 : DRM_WARN("Failed to load PCI state, err:%d\n", r);
5811 0 : return false;
5812 : }
5813 :
5814 0 : return true;
5815 : }
5816 :
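/*
 * amdgpu_device_flush_hdp - flush the HDP cache, via @ring when possible
 *
 * As the code below shows, the flush is skipped for APUs (unless running in
 * passthrough) and for GPUs whose memory is directly connected to the CPU;
 * otherwise it is issued either as a ring packet (when the ring supports
 * emit_hdp_flush) or through the ASIC-level callback.
 */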
5817 0 : void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5818 : struct amdgpu_ring *ring)
5819 : {
5820 : #ifdef CONFIG_X86_64
5821 0 : if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5822 : return;
5823 : #endif
5824 0 : if (adev->gmc.xgmi.connected_to_cpu)
5825 : return;
5826 :
5827 0 : if (ring && ring->funcs->emit_hdp_flush)
5828 0 : amdgpu_ring_emit_hdp_flush(ring);
5829 : else
5830 0 : amdgpu_asic_flush_hdp(adev, ring);
5831 : }
5832 :
5833 0 : void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5834 : struct amdgpu_ring *ring)
5835 : {
5836 : #ifdef CONFIG_X86_64
5837 0 : if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5838 : return;
5839 : #endif
5840 0 : if (adev->gmc.xgmi.connected_to_cpu)
5841 : return;
5842 :
5843 0 : amdgpu_asic_invalidate_hdp(adev, ring);
5844 : }
5845 :
5846 0 : int amdgpu_in_reset(struct amdgpu_device *adev)
5847 : {
5848 0 : return atomic_read(&adev->reset_domain->in_gpu_reset);
5849 : }
5850 :
5851 : /**
5852 : * amdgpu_device_halt() - bring hardware to some kind of halt state
5853 : *
5854 : * @adev: amdgpu_device pointer
5855 : *
5856 : * Bring the hardware to a halt state so that nothing can touch it
5857 : * anymore. This helps preserve the error context after an error has
5858 : * occurred. Compared to a simple hang, the system stays stable at least
5859 : * for SSH access, so it should be trivial to inspect the hardware state
5860 : * and see what is going on. Implemented as follows:
5861 : *
5862 : * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
5863 : * clears all CPU mappings to the device and disallows remappings through page faults
5864 : * 2. amdgpu_irq_disable_all() disables all interrupts
5865 : * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5866 : * 4. set adev->no_hw_access to avoid potential crashes after step 5
5867 : * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5868 : * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5869 : * flush any in-flight DMA operations
5870 : */
5871 0 : void amdgpu_device_halt(struct amdgpu_device *adev)
5872 : {
5873 0 : struct pci_dev *pdev = adev->pdev;
5874 0 : struct drm_device *ddev = adev_to_drm(adev);
5875 :
5876 0 : drm_dev_unplug(ddev);
5877 :
5878 0 : amdgpu_irq_disable_all(adev);
5879 :
5880 0 : amdgpu_fence_driver_hw_fini(adev);
5881 :
5882 0 : adev->no_hw_access = true;
5883 :
5884 0 : amdgpu_device_unmap_mmio(adev);
5885 :
5886 0 : pci_disable_device(pdev);
5887 0 : pci_wait_for_pending_transaction(pdev);
5888 0 : }
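/*
 * A minimal sketch (hypothetical caller, illustrative only) of the intended
 * use of amdgpu_device_halt(): on a fatal, unrecoverable error, halt the
 * device so that its error context is preserved while the rest of the
 * system stays reachable for inspection.
 */
static void example_handle_fatal_error(struct amdgpu_device *adev)
{
	dev_err(adev->dev, "fatal error, halting device for post-mortem inspection\n");
	amdgpu_device_halt(adev);
}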
5889 :
5890 0 : u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5891 : u32 reg)
5892 : {
5893 : unsigned long flags, address, data;
5894 : u32 r;
5895 :
5896 0 : address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5897 0 : data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5898 :
5899 0 : spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5900 0 : WREG32(address, reg * 4);
5901 0 : (void)RREG32(address);
5902 0 : r = RREG32(data);
5903 0 : spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5904 0 : return r;
5905 : }
5906 :
5907 0 : void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5908 : u32 reg, u32 v)
5909 : {
5910 : unsigned long flags, address, data;
5911 :
5912 0 : address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5913 0 : data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5914 :
5915 0 : spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5916 0 : WREG32(address, reg * 4);
5917 0 : (void)RREG32(address);
5918 0 : WREG32(data, v);
5919 0 : (void)RREG32(data);
5920 0 : spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5921 0 : }
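/*
 * A minimal sketch (hypothetical helper, illustrative only) combining the
 * two accessors above into a read-modify-write of an indirect PCIe port
 * register: read the current value, clear the masked bits, OR in the new
 * bits, and write the result back.
 */
static void example_pcie_port_update(struct amdgpu_device *adev,
				     u32 reg, u32 mask, u32 value)
{
	u32 tmp = amdgpu_device_pcie_port_rreg(adev, reg);

	tmp = (tmp & ~mask) | (value & mask);
	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
}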
|