/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "atom.h"
#include "amdgpu_reset.h"

#ifdef CONFIG_X86_MCE_AMD
#include <asm/mce.h>

static bool notifier_registered;
#endif
static const char *RAS_FS_NAME = "ras";

const char *ras_error_string[] = {
        "none",
        "parity",
        "single_correctable",
        "multi_uncorrectable",
        "poison",
};

const char *ras_block_string[] = {
        "umc",
        "sdma",
        "gfx",
        "mmhub",
        "athub",
        "pcie_bif",
        "hdp",
        "xgmi_wafl",
        "df",
        "smn",
        "sem",
        "mp0",
        "mp1",
        "fuse",
        "mca",
        "vcn",
        "jpeg",
};

const char *ras_mca_block_string[] = {
        "mca_mp0",
        "mca_mp1",
        "mca_mpio",
        "mca_iohc",
};

struct amdgpu_ras_block_list {
        /* ras block link */
        struct list_head node;

        struct amdgpu_ras_block_object *ras_obj;
};

const char *get_ras_block_str(struct ras_common_if *ras_block)
{
        if (!ras_block)
                return "NULL";

        if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
                return "OUT OF RANGE";

        if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
                return ras_mca_block_string[ras_block->sub_block_index];

        return ras_block_string[ras_block->block];
}

#define ras_block_str(_BLOCK_) \
        (((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")

#define ras_err_str(i) (ras_error_string[ffs(i)])

#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)

enum amdgpu_ras_retire_page_reservation {
        AMDGPU_RAS_RETIRE_PAGE_RESERVED,
        AMDGPU_RAS_RETIRE_PAGE_PENDING,
        AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
                                uint64_t addr);
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
                                uint64_t addr);
#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
struct mce_notifier_adev_list {
        struct amdgpu_device *devs[MAX_GPU_INSTANCE];
        int num_gpu;
};
static struct mce_notifier_adev_list mce_adev_list;
#endif

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{
        if (adev && amdgpu_ras_get_context(adev))
                amdgpu_ras_get_context(adev)->error_query_ready = ready;
}

static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
{
        if (adev && amdgpu_ras_get_context(adev))
                return amdgpu_ras_get_context(adev)->error_query_ready;

        return false;
}

static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
{
        struct ras_err_data err_data = {0, 0, 0, NULL};
        struct eeprom_table_record err_rec;

        if ((address >= adev->gmc.mc_vram_size) ||
            (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
                dev_warn(adev->dev,
                         "RAS WARN: input address 0x%llx is invalid.\n",
                         address);
                return -EINVAL;
        }

        if (amdgpu_ras_check_bad_page(adev, address)) {
                dev_warn(adev->dev,
                         "RAS WARN: 0x%llx has already been marked as bad page!\n",
                         address);
                return 0;
        }

        memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
        err_data.err_addr = &err_rec;
        amdgpu_umc_fill_error_record(&err_data, address,
                        (address >> AMDGPU_GPU_PAGE_SHIFT), 0, 0);

        if (amdgpu_bad_page_threshold != 0) {
                amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
                                         err_data.err_addr_cnt);
                amdgpu_ras_save_bad_pages(adev);
        }

        dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
        dev_warn(adev->dev, "Clear EEPROM:\n");
        dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");

        return 0;
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
                                        size_t size, loff_t *pos)
{
        struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
        struct ras_query_if info = {
                .head = obj->head,
        };
        ssize_t s;
        char val[128];

        if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;

        /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
        if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
            obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
                if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
                        dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
        }

        s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
                        "ue", info.ue_count,
                        "ce", info.ce_count);
        if (*pos >= s)
                return 0;

        s -= *pos;
        s = min_t(u64, s, size);

        if (copy_to_user(buf, &val[*pos], s))
                return -EINVAL;

        *pos += s;

        return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
        .owner = THIS_MODULE,
        .read = amdgpu_ras_debugfs_read,
        .write = NULL,
        .llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
                *block_id = i;
                if (strcmp(name, ras_block_string[i]) == 0)
                        return 0;
        }
        return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
                const char __user *buf, size_t size,
                loff_t *pos, struct ras_debug_if *data)
{
        ssize_t s = min_t(u64, 64, size);
        char str[65];
        char block_name[33];
        char err[9] = "ue";
        int op = -1;
        int block_id;
        uint32_t sub_block;
        u64 address, value;

        if (*pos)
                return -EINVAL;
        *pos = size;

        memset(str, 0, sizeof(str));
        memset(data, 0, sizeof(*data));

        if (copy_from_user(str, buf, s))
                return -EINVAL;

        if (sscanf(str, "disable %32s", block_name) == 1)
                op = 0;
        else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
                op = 1;
        else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
                op = 2;
        else if (strstr(str, "retire_page") != NULL)
                op = 3;
        else if (str[0] && str[1] && str[2] && str[3])
                /* ascii string, but commands are not matched. */
                return -EINVAL;

        if (op != -1) {
                if (op == 3) {
                        if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
                            sscanf(str, "%*s %llu", &address) != 1)
                                return -EINVAL;

                        data->op = op;
                        data->inject.address = address;

                        return 0;
                }

                if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
                        return -EINVAL;

                data->head.block = block_id;
                /* only ue and ce errors are supported */
                if (!memcmp("ue", err, 2))
                        data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
                else if (!memcmp("ce", err, 2))
                        data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
                else
                        return -EINVAL;

                data->op = op;

                if (op == 2) {
                        if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
                                   &sub_block, &address, &value) != 3 &&
                            sscanf(str, "%*s %*s %*s %u %llu %llu",
                                   &sub_block, &address, &value) != 3)
                                return -EINVAL;
                        data->head.sub_block_index = sub_block;
                        data->inject.address = address;
                        data->inject.value = value;
                }
        } else {
                if (size < sizeof(*data))
                        return -EINVAL;

                if (copy_from_user(data, buf, sizeof(*data)))
                        return -EINVAL;
        }

        return 0;
}

/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The control interface accepts a struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members: block, type, sub_block_index, and name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs, e.g. GFX and SDMA, have subcomponents.
 * name: the name of the IP.
 *
 * inject has two more members than head: address and value.
 * As their names indicate, the inject operation writes the
 * value to the address.
 *
 * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Takes ::head as its data.
 * - 1: enable RAS on the block. Takes ::head as its data.
 * - 2: inject errors on the block. Takes ::inject as its data.
 *
 * How to use the interface?
 *
 * In a program
 *
 * Copy the struct ras_debug_if into your code and initialize it.
 * Write the struct to the control interface.
 *
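 * For instance, a minimal userspace sketch is shown below. It is purely
 * illustrative: the mirrored struct layout and the numeric enum values are
 * assumptions that must match struct ras_debug_if in amdgpu_ras.h on the
 * running kernel, and the debugfs path assumes card 0.
 *
 * .. code-block:: c
 *
 *      #include <fcntl.h>
 *      #include <stdint.h>
 *      #include <unistd.h>
 *
 *      // Userspace mirror of the kernel structs (assumed layout).
 *      struct ras_common_if {
 *              int block;                // enum amdgpu_ras_block
 *              int type;                 // enum amdgpu_ras_error_type
 *              uint32_t sub_block_index;
 *              char name[32];
 *      };
 *
 *      struct ras_debug_if {
 *              union {
 *                      struct ras_common_if head;
 *                      struct {
 *                              struct ras_common_if head;
 *                              uint64_t address;
 *                              uint64_t value;
 *                      } inject;
 *              };
 *              int op;
 *      };
 *
 *      int main(void)
 *      {
 *              struct ras_debug_if data = {0};
 *              int fd;
 *
 *              data.head.block = 0;      // umc, assuming block 0
 *              data.head.type = 4;       // multi_uncorrectable, assumed value
 *              data.op = 1;              // 1: enable RAS on the block
 *
 *              fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *              if (fd < 0)
 *                      return 1;
 *              // A binary write must cover the whole struct; shorter writes
 *              // are rejected by amdgpu_ras_debugfs_ctrl_parse_data().
 *              if (write(fd, &data, sizeof(data)) != (ssize_t)sizeof(data)) {
 *                      close(fd);
 *                      return 1;
 *              }
 *              close(fd);
 *              return 0;
 *      }
 *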
 * From shell
 *
 * .. code-block:: bash
 *
 *      echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *      echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *      echo "inject  <block> <error> <sub-block> <address> <value>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *
 * Where N is the card you want to affect.
 *
 * "disable" requires only the block.
 * "enable" requires the block and error type.
 * "inject" requires the block, error type, address, and value.
 *
 * The block is one of: umc, sdma, gfx, etc.
 *      see ras_block_string[] for details
 *
 * The error type is one of: ue, ce, where,
 *      ue is multi-uncorrectable
 *      ce is single-correctable
 *
 * The sub-block is the sub-block index; pass 0 if there is no sub-block.
 * The address and value are hexadecimal numbers; the leading 0x is optional.
 *
 * For instance,
 *
 * .. code-block:: bash
 *
 *      echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *      echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *      echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result of the operation?
 *
 * To check disable/enable, see "ras" features at,
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * To check inject, see the corresponding error count at,
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
 *
 * .. note::
 *      Operations are only allowed on blocks which are supported.
 *      Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
 *      to see which blocks support RAS on a particular asic.
 *
 */
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
                                             const char __user *buf,
                                             size_t size, loff_t *pos)
{
        struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
        struct ras_debug_if data;
        int ret = 0;

        if (!amdgpu_ras_get_error_query_ready(adev)) {
                dev_warn(adev->dev, "RAS WARN: error injection "
                                "currently inaccessible\n");
                return size;
        }

        ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
        if (ret)
                return ret;

        if (data.op == 3) {
                ret = amdgpu_reserve_page_direct(adev, data.inject.address);
                if (!ret)
                        return size;
                else
                        return ret;
        }

        if (!amdgpu_ras_is_supported(adev, data.head.block))
                return -EINVAL;

        switch (data.op) {
        case 0:
                ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
                break;
        case 1:
                ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
                break;
        case 2:
                if ((data.inject.address >= adev->gmc.mc_vram_size) ||
                    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
                        dev_warn(adev->dev, "RAS WARN: input address "
                                        "0x%llx is invalid.",
                                        data.inject.address);
                        ret = -EINVAL;
                        break;
                }

                /* umc ce/ue error injection for a bad page is not allowed */
                if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
                    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
                        dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
                                 "already been marked as bad!\n",
                                 data.inject.address);
                        break;
                }

                /* data.inject.address is offset instead of absolute gpu address */
                ret = amdgpu_ras_error_inject(adev, &data.inject);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        if (ret)
                return ret;

        return size;
}

/**
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Some boards contain an EEPROM which is used to persistently store a list of
 * bad pages which experienced ECC errors in VRAM.  This interface provides
 * a way to reset the EEPROM, e.g., after testing error injection.
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *      echo 1 > ../ras/ras_eeprom_reset
 *
 * will reset the EEPROM table to 0 entries.
 *
 */
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
                                               const char __user *buf,
                                               size_t size, loff_t *pos)
{
        struct amdgpu_device *adev =
                (struct amdgpu_device *)file_inode(f)->i_private;
        int ret;

        ret = amdgpu_ras_eeprom_reset_table(
                &(amdgpu_ras_get_context(adev)->eeprom_control));

        if (!ret) {
                /* Something was written to EEPROM.
                 */
                amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
                return size;
        } else {
                return ret;
        }
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
        .owner = THIS_MODULE,
        .read = NULL,
        .write = amdgpu_ras_debugfs_ctrl_write,
        .llseek = default_llseek
};

static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
        .owner = THIS_MODULE,
        .read = NULL,
        .write = amdgpu_ras_debugfs_eeprom_write,
        .llseek = default_llseek
};

/**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * It allows the user to read the error count for each IP block on the GPU
 * through
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * It outputs multiple lines which report the uncorrected (ue) and corrected
 * (ce) error counts.
 *
 * The format of one line is as follows,
 *
 * [ce|ue]: count
 *
 * Example:
 *
 * .. code-block:: bash
 *
 *      ue: 0
 *      ce: 1
 *
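 * For example, to read the UMC error counts on card 0 (the path follows the
 * pattern above; substitute the block name as needed):
 *
 * .. code-block:: bash
 *
 *      cat /sys/class/drm/card0/device/ras/umc_err_count
 *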
 */
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
        struct ras_query_if info = {
                .head = obj->head,
        };

        if (!amdgpu_ras_get_error_query_ready(obj->adev))
                return sysfs_emit(buf, "Query currently inaccessible\n");

        if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;

        if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
            obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
                if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
                        dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
        }

        return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
                          "ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
        if (obj && (--obj->use == 0))
                list_del(&obj->node);
        if (obj && (obj->use < 0))
                DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", get_ras_block_str(&obj->head));
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;

        if (!adev->ras_enabled || !con)
                return NULL;

        if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
                return NULL;

        if (head->block == AMDGPU_RAS_BLOCK__MCA) {
                if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
                        return NULL;

                obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
        } else
                obj = &con->objs[head->block];

        /* already exists. return obj? */
        if (alive_obj(obj))
                return NULL;

        obj->head = *head;
        obj->adev = adev;
        list_add(&obj->node, &con->head);
        get_obj(obj);

        return obj;
}

/* return an obj equal to head, or the first when head is NULL */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;
        int i;

        if (!adev->ras_enabled || !con)
                return NULL;

        if (head) {
                if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
                        return NULL;

                if (head->block == AMDGPU_RAS_BLOCK__MCA) {
                        if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
                                return NULL;

                        obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
                } else
                        obj = &con->objs[head->block];

                if (alive_obj(obj))
                        return obj;
        } else {
                for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
                        obj = &con->objs[i];
                        if (alive_obj(obj))
                                return obj;
                }
        }

        return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
                                         struct ras_common_if *head)
{
        return adev->ras_hw_enabled & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

        return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
                struct ras_common_if *head, int enable)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

        /* If the hardware does not support ras, then do not create the obj.
         * But if the hardware does support ras, we can create the obj.
         * The ras framework checks con->hw_supported to see if it needs
         * to do the corresponding initialization.
         * The IP checks con->support to see if it needs to disable ras.
         */
        if (!amdgpu_ras_is_feature_allowed(adev, head))
                return 0;

        if (enable) {
                if (!obj) {
                        obj = amdgpu_ras_create_obj(adev, head);
                        if (!obj)
                                return -EINVAL;
                } else {
                        /* In case we create the obj somewhere else */
                        get_obj(obj);
                }
                con->features |= BIT(head->block);
        } else {
                if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
                        con->features &= ~BIT(head->block);
                        put_obj(obj);
                }
        }

        return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
                struct ras_common_if *head, bool enable)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        union ta_ras_cmd_input *info;
        int ret;

        if (!con)
                return -EINVAL;

        if (head->block == AMDGPU_RAS_BLOCK__GFX) {
                info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
                if (!info)
                        return -ENOMEM;

                if (!enable) {
                        info->disable_features = (struct ta_ras_disable_features_input) {
                                .block_id =  amdgpu_ras_block_to_ta(head->block),
                                .error_type = amdgpu_ras_error_to_ta(head->type),
                        };
                } else {
                        info->enable_features = (struct ta_ras_enable_features_input) {
                                .block_id =  amdgpu_ras_block_to_ta(head->block),
                                .error_type = amdgpu_ras_error_to_ta(head->type),
                        };
                }
        }

        /* Do not enable if it is not allowed. */
        WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));

        /* Only enable ras feature operation handle on host side */
        if (head->block == AMDGPU_RAS_BLOCK__GFX &&
                !amdgpu_sriov_vf(adev) &&
                !amdgpu_ras_intr_triggered()) {
                ret = psp_ras_enable_features(&adev->psp, info, enable);
                if (ret) {
                        dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
                                enable ? "enable":"disable",
                                get_ras_block_str(head),
                                amdgpu_ras_is_poison_mode_supported(adev), ret);
                        goto out;
                }
        }

        /* setup the obj */
        __amdgpu_ras_feature_enable(adev, head, enable);
        ret = 0;
out:
        if (head->block == AMDGPU_RAS_BLOCK__GFX)
                kfree(info);
        return ret;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
                struct ras_common_if *head, bool enable)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int ret;

        if (!con)
                return -EINVAL;

        if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
                if (enable) {
                        /* There is no harm in issuing a ras TA cmd regardless
                         * of the current ras state.
                         * If current state == target state, it will do nothing
                         * But sometimes it requests driver to reset and repost
                         * with error code -EAGAIN.
                         */
                        ret = amdgpu_ras_feature_enable(adev, head, 1);
                        /* With old ras TA, we might fail to enable ras.
                         * Log it and just setup the object.
                         * TODO: remove this WA in the future.
                         */
                        if (ret == -EINVAL) {
                                ret = __amdgpu_ras_feature_enable(adev, head, 1);
                                if (!ret)
                                        dev_info(adev->dev,
                                                "RAS INFO: %s setup object\n",
                                                get_ras_block_str(head));
                        }
                } else {
                        /* setup the object then issue a ras TA disable cmd.*/
                        ret = __amdgpu_ras_feature_enable(adev, head, 1);
                        if (ret)
                                return ret;

                        /* gfx block ras disable cmd must be sent to ras-ta */
                        if (head->block == AMDGPU_RAS_BLOCK__GFX)
                                con->features |= BIT(head->block);

                        ret = amdgpu_ras_feature_enable(adev, head, 0);

                        /* clean gfx block ras features flag */
                        if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
                                con->features &= ~BIT(head->block);
                }
        } else
                ret = amdgpu_ras_feature_enable(adev, head, enable);

        return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
                bool bypass)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj, *tmp;

        list_for_each_entry_safe(obj, tmp, &con->head, node) {
                /* bypass psp.
                 * aka just release the obj and corresponding flags
                 */
                if (bypass) {
                        if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
                                break;
                } else {
                        if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
                                break;
                }
        }

        return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
                bool bypass)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int i;
        const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;

        for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
                struct ras_common_if head = {
                        .block = i,
                        .type = default_ras_type,
                        .sub_block_index = 0,
                };

                if (i == AMDGPU_RAS_BLOCK__MCA)
                        continue;

                if (bypass) {
                        /*
                         * bypass psp. the vbios enables ras for us,
                         * so just create the obj
                         */
                        if (__amdgpu_ras_feature_enable(adev, &head, 1))
                                break;
                } else {
                        if (amdgpu_ras_feature_enable(adev, &head, 1))
                                break;
                }
        }

        for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
                struct ras_common_if head = {
                        .block = AMDGPU_RAS_BLOCK__MCA,
                        .type = default_ras_type,
                        .sub_block_index = i,
                };

                if (bypass) {
                        /*
                         * bypass psp. the vbios enables ras for us,
                         * so just create the obj
                         */
                        if (__amdgpu_ras_feature_enable(adev, &head, 1))
                                break;
                } else {
                        if (amdgpu_ras_feature_enable(adev, &head, 1))
                                break;
                }
        }

        return con->features;
}
/* feature ctl end */

static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
                enum amdgpu_ras_block block)
{
        if (!block_obj)
                return -EINVAL;

        if (block_obj->ras_comm.block == block)
                return 0;

        return -EINVAL;
}

static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
                                        enum amdgpu_ras_block block, uint32_t sub_block_index)
{
        struct amdgpu_ras_block_list *node, *tmp;
        struct amdgpu_ras_block_object *obj;

        if (block >= AMDGPU_RAS_BLOCK__LAST)
                return NULL;

        if (!amdgpu_ras_is_supported(adev, block))
                return NULL;

        list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
                if (!node->ras_obj) {
                        dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
                        continue;
                }

                obj = node->ras_obj;
                if (obj->ras_block_match) {
                        if (obj->ras_block_match(obj, block, sub_block_index) == 0)
                                return obj;
                } else {
                        if (amdgpu_ras_block_match_default(obj, block) == 0)
                                return obj;
                }
        }

        return NULL;
}

static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
{
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
        int ret = 0;

        /*
         * choose the right query method according to
         * whether the smu supports querying error information
         */
        ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
        if (ret == -EOPNOTSUPP) {
                if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
                        adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
                        adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);

                /* umc query_ras_error_address is also responsible for clearing
                 * error status
                 */
                if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
                    adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
                        adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
        } else if (!ret) {
                if (adev->umc.ras &&
                        adev->umc.ras->ecc_info_query_ras_error_count)
                        adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);

                if (adev->umc.ras &&
                        adev->umc.ras->ecc_info_query_ras_error_address)
                        adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
        }
}

     967             : /* query/inject/cure begin */
     968           0 : int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
     969             :                                   struct ras_query_if *info)
     970             : {
     971           0 :         struct amdgpu_ras_block_object *block_obj = NULL;
     972           0 :         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
     973           0 :         struct ras_err_data err_data = {0, 0, 0, NULL};
     974             : 
     975           0 :         if (!obj)
     976             :                 return -EINVAL;
     977             : 
     978           0 :         if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
     979           0 :                 amdgpu_ras_get_ecc_info(adev, &err_data);
     980             :         } else {
     981           0 :                 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
     982           0 :                 if (!block_obj || !block_obj->hw_ops)   {
     983           0 :                         dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
     984             :                                      get_ras_block_str(&info->head));
     985             :                         return -EINVAL;
     986             :                 }
     987             : 
     988           0 :                 if (block_obj->hw_ops->query_ras_error_count)
     989           0 :                         block_obj->hw_ops->query_ras_error_count(adev, &err_data);
     990             : 
     991           0 :                 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
     992           0 :                     (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
     993             :                     (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
      994           0 :                         if (block_obj->hw_ops->query_ras_error_status)
      995           0 :                                 block_obj->hw_ops->query_ras_error_status(adev);
      996             :                 }
     997             :         }
     998             : 
     999           0 :         obj->err_data.ue_count += err_data.ue_count;
    1000           0 :         obj->err_data.ce_count += err_data.ce_count;
    1001             : 
    1002           0 :         info->ue_count = obj->err_data.ue_count;
    1003           0 :         info->ce_count = obj->err_data.ce_count;
    1004             : 
    1005           0 :         if (err_data.ce_count) {
    1006           0 :                 if (adev->smuio.funcs &&
    1007           0 :                     adev->smuio.funcs->get_socket_id &&
    1008           0 :                     adev->smuio.funcs->get_die_id) {
    1009           0 :                         dev_info(adev->dev, "socket: %d, die: %d "
    1010             :                                         "%ld correctable hardware errors "
    1011             :                                         "detected in %s block, no user "
    1012             :                                         "action is needed.\n",
    1013             :                                         adev->smuio.funcs->get_socket_id(adev),
    1014             :                                         adev->smuio.funcs->get_die_id(adev),
    1015             :                                         obj->err_data.ce_count,
    1016             :                                         get_ras_block_str(&info->head));
    1017             :                 } else {
    1018           0 :                         dev_info(adev->dev, "%ld correctable hardware errors "
    1019             :                                         "detected in %s block, no user "
    1020             :                                         "action is needed.\n",
    1021             :                                         obj->err_data.ce_count,
    1022             :                                         get_ras_block_str(&info->head));
    1023             :                 }
    1024             :         }
    1025           0 :         if (err_data.ue_count) {
    1026           0 :                 if (adev->smuio.funcs &&
    1027           0 :                     adev->smuio.funcs->get_socket_id &&
    1028           0 :                     adev->smuio.funcs->get_die_id) {
    1029           0 :                         dev_info(adev->dev, "socket: %d, die: %d "
    1030             :                                         "%ld uncorrectable hardware errors "
    1031             :                                         "detected in %s block\n",
    1032             :                                         adev->smuio.funcs->get_socket_id(adev),
    1033             :                                         adev->smuio.funcs->get_die_id(adev),
    1034             :                                         obj->err_data.ue_count,
    1035             :                                         get_ras_block_str(&info->head));
    1036             :                 } else {
    1037           0 :                         dev_info(adev->dev, "%ld uncorrectable hardware errors "
    1038             :                                         "detected in %s block\n",
    1039             :                                         obj->err_data.ue_count,
    1040             :                                         get_ras_block_str(&info->head));
    1041             :                 }
    1042             :         }
    1043             : 
    1044             :         return 0;
    1045             : }
    1046             : 
    1047           0 : int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
    1048             :                 enum amdgpu_ras_block block)
    1049             : {
    1050           0 :         struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
    1051             : 
    1052           0 :         if (!amdgpu_ras_is_supported(adev, block))
    1053             :                 return -EINVAL;
    1054             : 
    1055           0 :         if (!block_obj || !block_obj->hw_ops)   {
    1056           0 :                 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
    1057             :                              ras_block_str(block));
    1058             :                 return -EINVAL;
    1059             :         }
    1060             : 
    1061           0 :         if (block_obj->hw_ops->reset_ras_error_count)
    1062           0 :                 block_obj->hw_ops->reset_ras_error_count(adev);
    1063             : 
    1064           0 :         if ((block == AMDGPU_RAS_BLOCK__GFX) ||
    1065             :             (block == AMDGPU_RAS_BLOCK__MMHUB)) {
    1066           0 :                 if (block_obj->hw_ops->reset_ras_error_status)
    1067           0 :                         block_obj->hw_ops->reset_ras_error_status(adev);
    1068             :         }
    1069             : 
    1070             :         return 0;
    1071             : }
    1072             : 
    1073             : /* wrapper of psp_ras_trigger_error */
    1074           0 : int amdgpu_ras_error_inject(struct amdgpu_device *adev,
    1075             :                 struct ras_inject_if *info)
    1076             : {
    1077           0 :         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
    1078           0 :         struct ta_ras_trigger_error_input block_info = {
    1079           0 :                 .block_id =  amdgpu_ras_block_to_ta(info->head.block),
    1080           0 :                 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
    1081           0 :                 .sub_block_index = info->head.sub_block_index,
    1082           0 :                 .address = info->address,
    1083           0 :                 .value = info->value,
    1084             :         };
    1085           0 :         int ret = -EINVAL;
    1086           0 :         struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
    1087             :                                                         info->head.block,
    1088             :                                                         info->head.sub_block_index);
    1089             : 
    1090           0 :         if (!obj)
    1091             :                 return -EINVAL;
    1092             : 
    1093           0 :         if (!block_obj || !block_obj->hw_ops)        {
    1094           0 :                 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
    1095             :                              get_ras_block_str(&info->head));
    1096             :                 return -EINVAL;
    1097             :         }
    1098             : 
    1099             :         /* Calculate XGMI relative offset */
    1100           0 :         if (adev->gmc.xgmi.num_physical_nodes > 1) {
    1101           0 :                 block_info.address =
    1102           0 :                         amdgpu_xgmi_get_relative_phy_addr(adev,
    1103             :                                                           block_info.address);
    1104             :         }
    1105             : 
    1106           0 :         if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
    1107           0 :                 if (block_obj->hw_ops->ras_error_inject)
    1108           0 :                         ret = block_obj->hw_ops->ras_error_inject(adev, info);
    1109             :         } else {
     1110             :                 /* If the block defines its own ras_error_inject (e.g. XGMI), use it */
    1111           0 :                 if (block_obj->hw_ops->ras_error_inject)
    1112           0 :                         ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
     1113             :                 else  /* otherwise fall back to the default PSP-based injection */
    1114           0 :                         ret = psp_ras_trigger_error(&adev->psp, &block_info);
    1115             :         }
    1116             : 
    1117           0 :         if (ret)
    1118           0 :                 dev_err(adev->dev, "ras inject %s failed %d\n",
    1119             :                         get_ras_block_str(&info->head), ret);
    1120             : 
    1121             :         return ret;
    1122             : }
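                      : 
                      : /*
                      :  * A minimal sketch of how a caller might build a request for the
                      :  * injection wrapper above. The field values are placeholders, not a
                      :  * recommended test; demo_inject_gfx_ue() is a hypothetical name.
                      :  */
                      : #if 0 /* illustrative only */
                      : static int demo_inject_gfx_ue(struct amdgpu_device *adev)
                      : {
                      :         struct ras_inject_if info = {
                      :                 .head = {
                      :                         .block = AMDGPU_RAS_BLOCK__GFX,
                      :                         .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
                      :                         .sub_block_index = 0,
                      :                 },
                      :                 .address = 0,   /* placeholder */
                      :                 .value = 0,     /* placeholder */
                      :         };
                      : 
                      :         return amdgpu_ras_error_inject(adev, &info);
                      : }
                      : #endif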
    1123             : 
    1124             : /**
    1125             :  * amdgpu_ras_query_error_count -- Get error counts of all IPs
    1126             :  * @adev: pointer to AMD GPU device
     1127             :  * @ce_count: pointer to an integer to be set to the count of correctable errors.
     1128             :  * @ue_count: pointer to an integer to be set to the count of uncorrectable
     1129             :  * errors.
     1130             :  *
     1131             :  * If @ce_count or @ue_count is set, count the corresponding errors and
     1132             :  * return the totals through those pointers. Return 0 if the device
     1133             :  * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
    1134             :  */
    1135           0 : int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
    1136             :                                  unsigned long *ce_count,
    1137             :                                  unsigned long *ue_count)
    1138             : {
    1139           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1140             :         struct ras_manager *obj;
    1141             :         unsigned long ce, ue;
    1142             : 
    1143           0 :         if (!adev->ras_enabled || !con)
    1144             :                 return -EOPNOTSUPP;
    1145             : 
     1146             :         /* Nothing to count if the caller requests neither value.
     1147             :          */
    1148           0 :         if (!ce_count && !ue_count)
    1149             :                 return 0;
    1150             : 
    1151           0 :         ce = 0;
    1152           0 :         ue = 0;
    1153           0 :         list_for_each_entry(obj, &con->head, node) {
    1154           0 :                 struct ras_query_if info = {
    1155             :                         .head = obj->head,
    1156             :                 };
    1157             :                 int res;
    1158             : 
    1159           0 :                 res = amdgpu_ras_query_error_status(adev, &info);
    1160           0 :                 if (res)
    1161           0 :                         return res;
    1162             : 
    1163           0 :                 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
    1164             :                     adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
    1165           0 :                         if (amdgpu_ras_reset_error_status(adev, info.head.block))
    1166           0 :                                 dev_warn(adev->dev, "Failed to reset error counter and error status");
    1167             :                 }
    1168             : 
    1169           0 :                 ce += info.ce_count;
    1170           0 :                 ue += info.ue_count;
    1171             :         }
    1172             : 
    1173           0 :         if (ce_count)
    1174           0 :                 *ce_count = ce;
    1175             : 
    1176           0 :         if (ue_count)
    1177           0 :                 *ue_count = ue;
    1178             : 
    1179             :         return 0;
    1180             : }
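                      : 
                      : /*
                      :  * A minimal sketch of how a caller might use the query above; the
                      :  * polling context and the ras_counts_example() name are hypothetical.
                      :  */
                      : #if 0 /* illustrative only */
                      : static void ras_counts_example(struct amdgpu_device *adev)
                      : {
                      :         unsigned long ce_count = 0, ue_count = 0;
                      : 
                      :         /* -EOPNOTSUPP simply means the device has no RAS support */
                      :         if (!amdgpu_ras_query_error_count(adev, &ce_count, &ue_count))
                      :                 dev_dbg(adev->dev, "RAS totals: %lu CE, %lu UE\n",
                      :                         ce_count, ue_count);
                      : }
                      : #endif
                      : 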
    1181             : /* query/inject/cure end */
    1182             : 
    1183             : 
    1184             : /* sysfs begin */
    1185             : 
    1186             : static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
    1187             :                 struct ras_badpage **bps, unsigned int *count);
    1188             : 
    1189             : static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
    1190             : {
    1191           0 :         switch (flags) {
    1192             :         case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
    1193             :                 return "R";
    1194             :         case AMDGPU_RAS_RETIRE_PAGE_PENDING:
    1195             :                 return "P";
    1196             :         case AMDGPU_RAS_RETIRE_PAGE_FAULT:
    1197             :         default:
    1198             :                 return "F";
    1199             :         }
    1200             : }
    1201             : 
    1202             : /**
    1203             :  * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
    1204             :  *
     1205             :  * It allows users to read the bad pages of VRAM on the GPU through
     1206             :  * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
     1207             :  *
     1208             :  * It outputs multiple lines, and each line stands for one GPU page.
     1209             :  *
     1210             :  * The format of one line is as follows:
     1211             :  * gpu pfn : gpu page size : flags
     1212             :  *
     1213             :  * gpu pfn and gpu page size are printed in hex format.
     1214             :  * flags can be one of the following characters:
     1215             :  *
     1216             :  * R: reserved, this GPU page is reserved and not available for use.
     1217             :  *
     1218             :  * P: pending for reserve, this GPU page is marked as bad and will be
     1219             :  * reserved in the next window of page_reserve.
     1220             :  *
     1221             :  * F: unable to reserve, this GPU page cannot be reserved for some reason.
    1222             :  *
    1223             :  * Examples:
    1224             :  *
    1225             :  * .. code-block:: bash
    1226             :  *
    1227             :  *      0x00000001 : 0x00001000 : R
    1228             :  *      0x00000002 : 0x00001000 : P
    1229             :  *
    1230             :  */
    1231             : 
    1232           0 : static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
    1233             :                 struct kobject *kobj, struct bin_attribute *attr,
    1234             :                 char *buf, loff_t ppos, size_t count)
    1235             : {
    1236           0 :         struct amdgpu_ras *con =
    1237           0 :                 container_of(attr, struct amdgpu_ras, badpages_attr);
    1238           0 :         struct amdgpu_device *adev = con->adev;
    1239           0 :         const unsigned int element_size =
    1240             :                 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
    1241           0 :         unsigned int start = div64_ul(ppos + element_size - 1, element_size);
    1242           0 :         unsigned int end = div64_ul(ppos + count - 1, element_size);
    1243           0 :         ssize_t s = 0;
    1244           0 :         struct ras_badpage *bps = NULL;
    1245           0 :         unsigned int bps_count = 0;
    1246             : 
    1247           0 :         memset(buf, 0, count);
    1248             : 
    1249           0 :         if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
    1250             :                 return 0;
    1251             : 
    1252           0 :         for (; start < end && start < bps_count; start++)
    1253           0 :                 s += scnprintf(&buf[s], element_size + 1,
    1254             :                                 "0x%08x : 0x%08x : %1s\n",
    1255             :                                 bps[start].bp,
    1256             :                                 bps[start].size,
    1257           0 :                                 amdgpu_ras_badpage_flags_str(bps[start].flags));
    1258             : 
    1259           0 :         kfree(bps);
    1260             : 
    1261           0 :         return s;
    1262             : }
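                      : 
                      : /*
                      :  * Worked example (illustrative): each record above is a fixed
                      :  * sizeof("0xabcdabcd : 0x12345678 : R\n") - 1 = 28 bytes, so a read
                      :  * with ppos = 0 and count = 29 gives start = 0, end = 1 and emits
                      :  * exactly one record; partial trailing records are never emitted.
                      :  */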
    1263             : 
    1264           0 : static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
    1265             :                 struct device_attribute *attr, char *buf)
    1266             : {
    1267           0 :         struct amdgpu_ras *con =
    1268           0 :                 container_of(attr, struct amdgpu_ras, features_attr);
    1269             : 
    1270           0 :         return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
    1271             : }
    1272             : 
    1273             : static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
    1274             : {
    1275           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1276             : 
    1277           0 :         sysfs_remove_file_from_group(&adev->dev->kobj,
    1278           0 :                                 &con->badpages_attr.attr,
    1279             :                                 RAS_FS_NAME);
    1280             : }
    1281             : 
    1282           0 : static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
    1283             : {
    1284           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1285           0 :         struct attribute *attrs[] = {
    1286           0 :                 &con->features_attr.attr,
    1287             :                 NULL
    1288             :         };
    1289           0 :         struct attribute_group group = {
    1290             :                 .name = RAS_FS_NAME,
    1291             :                 .attrs = attrs,
    1292             :         };
    1293             : 
    1294           0 :         sysfs_remove_group(&adev->dev->kobj, &group);
    1295             : 
    1296           0 :         return 0;
    1297             : }
    1298             : 
    1299           0 : int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
    1300             :                 struct ras_common_if *head)
    1301             : {
    1302           0 :         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
    1303             : 
    1304           0 :         if (!obj || obj->attr_inuse)
    1305             :                 return -EINVAL;
    1306             : 
    1307           0 :         get_obj(obj);
    1308             : 
    1309           0 :         snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
    1310           0 :                 "%s_err_count", head->name);
    1311             : 
    1312           0 :         obj->sysfs_attr = (struct device_attribute){
    1313             :                 .attr = {
    1314             :                         .name = obj->fs_data.sysfs_name,
    1315             :                         .mode = S_IRUGO,
    1316             :                 },
     1317             :                 .show = amdgpu_ras_sysfs_read,
    1318             :         };
    1319             :         sysfs_attr_init(&obj->sysfs_attr.attr);
    1320             : 
    1321           0 :         if (sysfs_add_file_to_group(&adev->dev->kobj,
    1322           0 :                                 &obj->sysfs_attr.attr,
    1323             :                                 RAS_FS_NAME)) {
    1324           0 :                 put_obj(obj);
    1325           0 :                 return -EINVAL;
    1326             :         }
    1327             : 
    1328           0 :         obj->attr_inuse = 1;
    1329             : 
    1330           0 :         return 0;
    1331             : }
    1332             : 
    1333           0 : int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
    1334             :                 struct ras_common_if *head)
    1335             : {
    1336           0 :         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
    1337             : 
    1338           0 :         if (!obj || !obj->attr_inuse)
    1339             :                 return -EINVAL;
    1340             : 
    1341           0 :         sysfs_remove_file_from_group(&adev->dev->kobj,
    1342           0 :                                 &obj->sysfs_attr.attr,
    1343             :                                 RAS_FS_NAME);
    1344           0 :         obj->attr_inuse = 0;
    1345           0 :         put_obj(obj);
    1346             : 
    1347           0 :         return 0;
    1348             : }
    1349             : 
    1350           0 : static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
    1351             : {
    1352           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1353             :         struct ras_manager *obj, *tmp;
    1354             : 
    1355           0 :         list_for_each_entry_safe(obj, tmp, &con->head, node) {
    1356           0 :                 amdgpu_ras_sysfs_remove(adev, &obj->head);
    1357             :         }
    1358             : 
    1359           0 :         if (amdgpu_bad_page_threshold != 0)
    1360             :                 amdgpu_ras_sysfs_remove_bad_page_node(adev);
    1361             : 
    1362           0 :         amdgpu_ras_sysfs_remove_feature_node(adev);
    1363             : 
    1364           0 :         return 0;
    1365             : }
    1366             : /* sysfs end */
    1367             : 
    1368             : /**
    1369             :  * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
    1370             :  *
     1371             :  * Normally, when there is an uncorrectable error, the driver will reset
     1372             :  * the GPU to recover.  However, in the event of an unrecoverable error,
     1373             :  * the driver also provides an interface to reboot the system
     1374             :  * automatically.
    1375             :  *
    1376             :  * The following file in debugfs provides that interface:
    1377             :  * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
    1378             :  *
    1379             :  * Usage:
    1380             :  *
    1381             :  * .. code-block:: bash
    1382             :  *
    1383             :  *      echo true > .../ras/auto_reboot
    1384             :  *
    1385             :  */
    1386             : /* debugfs begin */
    1387             : static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
    1388             : {
    1389             :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1390             :         struct drm_minor  *minor = adev_to_drm(adev)->primary;
    1391             :         struct dentry     *dir;
    1392             : 
    1393             :         dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
    1394             :         debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
    1395             :                             &amdgpu_ras_debugfs_ctrl_ops);
    1396             :         debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
    1397             :                             &amdgpu_ras_debugfs_eeprom_ops);
    1398             :         debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
    1399             :                            &con->bad_page_cnt_threshold);
    1400             :         debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
    1401             :         debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
    1402             :         debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
    1403             :                             &amdgpu_ras_debugfs_eeprom_size_ops);
    1404             :         con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
    1405             :                                                        S_IRUGO, dir, adev,
    1406             :                                                        &amdgpu_ras_debugfs_eeprom_table_ops);
    1407             :         amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
    1408             : 
     1409             :         /*
     1410             :          * After an uncorrectable error occurs, GPU recovery is usually
     1411             :          * scheduled, but it is known to sometimes fail to bring the GPU
     1412             :          * back. This interface therefore gives the user a direct way to
     1413             :          * reboot the system automatically when an ERREVENT_ATHUB_INTERRUPT
     1414             :          * is generated; in that case the normal GPU recovery routine is
     1415             :          * never called.
     1416             :          */
    1417             :         debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
    1418             : 
     1419             :         /*
     1420             :          * The user can set this so that the hardware error count registers
     1421             :          * of RAS IPs are not cleared during RAS recovery.
     1422             :          */
    1423             :         debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
    1424             :                             &con->disable_ras_err_cnt_harvest);
    1425             :         return dir;
    1426             : }
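                      : 
                      : /*
                      :  * The resulting layout under /sys/kernel/debug/dri/<n>/ras/ (paths
                      :  * follow the DOC above; listed here for illustration):
                      :  *
                      :  *   ras_ctrl, ras_eeprom_reset, bad_page_cnt_threshold,
                      :  *   ras_hw_enabled, ras_enabled, ras_eeprom_size, ras_eeprom_table,
                      :  *   auto_reboot, disable_ras_err_cnt_harvest
                      :  */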
    1427             : 
    1428             : static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
    1429             :                                       struct ras_fs_if *head,
    1430             :                                       struct dentry *dir)
    1431             : {
    1432             :         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
    1433             : 
    1434             :         if (!obj || !dir)
    1435             :                 return;
    1436             : 
    1437             :         get_obj(obj);
    1438             : 
    1439             :         memcpy(obj->fs_data.debugfs_name,
    1440             :                         head->debugfs_name,
    1441             :                         sizeof(obj->fs_data.debugfs_name));
    1442             : 
    1443             :         debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
    1444             :                             obj, &amdgpu_ras_debugfs_ops);
    1445             : }
    1446             : 
    1447           0 : void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
    1448             : {
    1449           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1450             :         struct dentry *dir;
    1451             :         struct ras_manager *obj;
    1452             :         struct ras_fs_if fs_info;
    1453             : 
     1454             :         /*
     1455             :          * This is not called in the resume path, so there is no need
     1456             :          * to check the suspend and GPU reset status.
     1457             :          */
    1458             :         if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
    1459             :                 return;
    1460             : 
    1461             :         dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
    1462             : 
    1463             :         list_for_each_entry(obj, &con->head, node) {
    1464             :                 if (amdgpu_ras_is_supported(adev, obj->head.block) &&
    1465             :                         (obj->attr_inuse == 1)) {
     1466             :                         snprintf(fs_info.debugfs_name, sizeof(fs_info.debugfs_name),
     1467             :                                         "%s_err_inject", get_ras_block_str(&obj->head));
    1468             :                         fs_info.head = obj->head;
    1469             :                         amdgpu_ras_debugfs_create(adev, &fs_info, dir);
    1470             :                 }
    1471             :         }
    1472             : }
    1473             : 
    1474             : /* debugfs end */
    1475             : 
    1476             : /* ras fs */
    1477             : static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
    1478             :                 amdgpu_ras_sysfs_badpages_read, NULL, 0);
    1479             : static DEVICE_ATTR(features, S_IRUGO,
    1480             :                 amdgpu_ras_sysfs_features_read, NULL);
    1481           0 : static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
    1482             : {
    1483           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1484           0 :         struct attribute_group group = {
    1485             :                 .name = RAS_FS_NAME,
    1486             :         };
    1487           0 :         struct attribute *attrs[] = {
    1488           0 :                 &con->features_attr.attr,
    1489             :                 NULL
    1490             :         };
    1491           0 :         struct bin_attribute *bin_attrs[] = {
    1492             :                 NULL,
    1493             :                 NULL,
    1494             :         };
    1495             :         int r;
    1496             : 
    1497             :         /* add features entry */
    1498           0 :         con->features_attr = dev_attr_features;
    1499           0 :         group.attrs = attrs;
    1500             :         sysfs_attr_init(attrs[0]);
    1501             : 
    1502           0 :         if (amdgpu_bad_page_threshold != 0) {
    1503             :                 /* add bad_page_features entry */
    1504           0 :                 bin_attr_gpu_vram_bad_pages.private = NULL;
    1505           0 :                 con->badpages_attr = bin_attr_gpu_vram_bad_pages;
    1506           0 :                 bin_attrs[0] = &con->badpages_attr;
    1507           0 :                 group.bin_attrs = bin_attrs;
    1508             :                 sysfs_bin_attr_init(bin_attrs[0]);
    1509             :         }
    1510             : 
    1511           0 :         r = sysfs_create_group(&adev->dev->kobj, &group);
    1512           0 :         if (r)
    1513           0 :                 dev_err(adev->dev, "Failed to create RAS sysfs group!");
    1514             : 
    1515           0 :         return 0;
    1516             : }
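                      : 
                      : /*
                      :  * The resulting group lives under the device sysfs directory (see the
                      :  * DOC above), e.g. /sys/class/drm/card<n>/device/ras/features and,
                      :  * when the bad page threshold is non-zero,
                      :  * /sys/class/drm/card<n>/device/ras/gpu_vram_bad_pages.
                      :  */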
    1517             : 
    1518             : static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
    1519             : {
    1520           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1521             :         struct ras_manager *con_obj, *ip_obj, *tmp;
    1522             : 
    1523             :         if (IS_ENABLED(CONFIG_DEBUG_FS)) {
    1524             :                 list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
    1525             :                         ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
    1526             :                         if (ip_obj)
    1527             :                                 put_obj(ip_obj);
    1528             :                 }
    1529             :         }
    1530             : 
    1531           0 :         amdgpu_ras_sysfs_remove_all(adev);
    1532             :         return 0;
    1533             : }
    1534             : /* ras fs end */
    1535             : 
    1536             : /* ih begin */
    1537             : 
     1538             : /* For hardware that cannot enable the BIF ring for both the
     1539             :  * ras_controller_irq and ras_err_event_athub_irq ih cookies, the driver
     1540             :  * has to poll the status register to check whether the interrupt has
     1541             :  * triggered, and properly ack the interrupt if it is there.
     1542             :  */
    1543           0 : void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
    1544             : {
    1545             :         /* Fatal error events are handled on host side */
    1546           0 :         if (amdgpu_sriov_vf(adev) ||
    1547           0 :                 !amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
    1548             :                 return;
    1549             : 
    1550           0 :         if (adev->nbio.ras &&
    1551           0 :             adev->nbio.ras->handle_ras_controller_intr_no_bifring)
    1552           0 :                 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
    1553             : 
    1554           0 :         if (adev->nbio.ras &&
    1555           0 :             adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
    1556           0 :                 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
    1557             : }
    1558             : 
    1559           0 : static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
    1560             :                                 struct amdgpu_iv_entry *entry)
    1561             : {
    1562           0 :         bool poison_stat = false;
    1563           0 :         struct amdgpu_device *adev = obj->adev;
    1564           0 :         struct ras_err_data err_data = {0, 0, 0, NULL};
    1565           0 :         struct amdgpu_ras_block_object *block_obj =
    1566           0 :                 amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
    1567             : 
    1568           0 :         if (!block_obj || !block_obj->hw_ops)
    1569           0 :                 return;
    1570             : 
     1571             :         /* Both query_poison_status and handle_poison_consumption are
     1572             :          * optional, but at least one of them should be implemented if a
     1573             :          * poison consumption handler is needed.
     1574             :          */
    1575           0 :         if (block_obj->hw_ops->query_poison_status) {
    1576           0 :                 poison_stat = block_obj->hw_ops->query_poison_status(adev);
    1577           0 :                 if (!poison_stat) {
     1578             :                         /* Not a poison consumption interrupt; nothing to handle */
    1579           0 :                         dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
    1580             :                                         block_obj->ras_comm.name);
    1581             : 
    1582             :                         return;
    1583             :                 }
    1584             :         }
    1585             : 
    1586           0 :         if (!adev->gmc.xgmi.connected_to_cpu)
    1587           0 :                 amdgpu_umc_poison_handler(adev, &err_data, false);
    1588             : 
    1589           0 :         if (block_obj->hw_ops->handle_poison_consumption)
    1590           0 :                 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
    1591             : 
     1592             :         /* GPU reset is the fallback for the failed and default cases */
    1593           0 :         if (poison_stat) {
    1594           0 :                 dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
    1595             :                                 block_obj->ras_comm.name);
    1596           0 :                 amdgpu_ras_reset_gpu(adev);
    1597             :         }
    1598             : }
    1599             : 
    1600             : static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
    1601             :                                 struct amdgpu_iv_entry *entry)
    1602             : {
    1603           0 :         dev_info(obj->adev->dev,
    1604             :                 "Poison is created, no user action is needed.\n");
    1605             : }
    1606             : 
    1607           0 : static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
    1608             :                                 struct amdgpu_iv_entry *entry)
    1609             : {
    1610           0 :         struct ras_ih_data *data = &obj->ih_data;
    1611           0 :         struct ras_err_data err_data = {0, 0, 0, NULL};
    1612             :         int ret;
    1613             : 
    1614           0 :         if (!data->cb)
    1615           0 :                 return;
    1616             : 
     1617             :         /* Let the IP handle its data; we may need the output from the
     1618             :          * callback to update the error type/count, etc.
     1619             :          */
    1620           0 :         ret = data->cb(obj->adev, &err_data, entry);
     1621             :         /* A UE will trigger an interrupt, and in that case we need to
     1622             :          * do a reset to recover the whole system. But leave that
     1623             :          * recovery to the IP; here we just dispatch the error.
     1624             :          */
    1626           0 :         if (ret == AMDGPU_RAS_SUCCESS) {
     1627             :                 /* these counts may be left at 0 if some blocks
     1628             :                  * do not count errors
     1629             :                  */
    1630           0 :                 obj->err_data.ue_count += err_data.ue_count;
    1631           0 :                 obj->err_data.ce_count += err_data.ce_count;
    1632             :         }
    1633             : }
    1634             : 
    1635           0 : static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
    1636             : {
    1637           0 :         struct ras_ih_data *data = &obj->ih_data;
    1638             :         struct amdgpu_iv_entry entry;
    1639             : 
    1640           0 :         while (data->rptr != data->wptr) {
    1641           0 :                 rmb();
    1642           0 :                 memcpy(&entry, &data->ring[data->rptr],
    1643           0 :                                 data->element_size);
    1644             : 
    1645           0 :                 wmb();
    1646           0 :                 data->rptr = (data->aligned_element_size +
    1647           0 :                                 data->rptr) % data->ring_size;
    1648             : 
    1649           0 :                 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
    1650           0 :                         if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
    1651           0 :                                 amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
    1652             :                         else
    1653           0 :                                 amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
    1654             :                 } else {
    1655           0 :                         if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
    1656           0 :                                 amdgpu_ras_interrupt_umc_handler(obj, &entry);
    1657             :                         else
    1658           0 :                                 dev_warn(obj->adev->dev,
    1659             :                                         "No RAS interrupt handler for non-UMC block with poison disabled.\n");
    1660             :                 }
    1661             :         }
    1662           0 : }
    1663             : 
    1664           0 : static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
    1665             : {
    1666           0 :         struct ras_ih_data *data =
    1667           0 :                 container_of(work, struct ras_ih_data, ih_work);
    1668           0 :         struct ras_manager *obj =
    1669           0 :                 container_of(data, struct ras_manager, ih_data);
    1670             : 
    1671           0 :         amdgpu_ras_interrupt_handler(obj);
    1672           0 : }
    1673             : 
    1674           0 : int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
    1675             :                 struct ras_dispatch_if *info)
    1676             : {
    1677           0 :         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
     1678             :         struct ras_ih_data *data;
     1679             : 
     1680           0 :         if (!obj)
     1681             :                 return -EINVAL;
     1682             : 
     1683             :         /* dereference obj only after the NULL check above */
     1684           0 :         data = &obj->ih_data;
     1685           0 :         if (data->inuse == 0)
    1684             :                 return 0;
    1685             : 
     1686             :         /* No overflow check: a full ring overwrites unconsumed entries */
    1687           0 :         memcpy(&data->ring[data->wptr], info->entry,
    1688           0 :                         data->element_size);
    1689             : 
    1690           0 :         wmb();
    1691           0 :         data->wptr = (data->aligned_element_size +
    1692           0 :                         data->wptr) % data->ring_size;
    1693             : 
    1694           0 :         schedule_work(&data->ih_work);
    1695             : 
    1696           0 :         return 0;
    1697             : }
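                      : 
                      : /*
                      :  * A self-contained sketch of the single-producer/single-consumer byte
                      :  * ring used by the dispatch/handler pair above. The demo_ring names
                      :  * are hypothetical; only the rptr/wptr modulo arithmetic and the
                      :  * barrier placement mirror the real code.
                      :  */
                      : #if 0 /* illustrative only */
                      : struct demo_ring {
                      :         char *ring;             /* ring_size bytes of storage   */
                      :         int element_size;       /* ALIGN(entry size, 8)         */
                      :         int ring_size;          /* 64 * element_size            */
                      :         int rptr, wptr;         /* byte offsets into ->ring     */
                      : };
                      : 
                      : static void demo_ring_push(struct demo_ring *r, const void *e)
                      : {
                      :         memcpy(&r->ring[r->wptr], e, r->element_size);
                      :         wmb();  /* publish the payload before moving wptr */
                      :         r->wptr = (r->wptr + r->element_size) % r->ring_size;
                      : }
                      : 
                      : static bool demo_ring_pop(struct demo_ring *r, void *e)
                      : {
                      :         if (r->rptr == r->wptr)
                      :                 return false;   /* empty */
                      :         rmb();  /* read the payload the producer published */
                      :         memcpy(e, &r->ring[r->rptr], r->element_size);
                      :         r->rptr = (r->rptr + r->element_size) % r->ring_size;
                      :         return true;
                      : }
                      : #endif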
    1698             : 
    1699           0 : int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
    1700             :                 struct ras_common_if *head)
    1701             : {
    1702           0 :         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
    1703             :         struct ras_ih_data *data;
    1704             : 
    1705           0 :         if (!obj)
    1706             :                 return -EINVAL;
    1707             : 
    1708           0 :         data = &obj->ih_data;
    1709           0 :         if (data->inuse == 0)
    1710             :                 return 0;
    1711             : 
    1712           0 :         cancel_work_sync(&data->ih_work);
    1713             : 
    1714           0 :         kfree(data->ring);
    1715           0 :         memset(data, 0, sizeof(*data));
    1716           0 :         put_obj(obj);
    1717             : 
    1718           0 :         return 0;
    1719             : }
    1720             : 
    1721           0 : int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
    1722             :                 struct ras_common_if *head)
    1723             : {
    1724           0 :         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
    1725             :         struct ras_ih_data *data;
    1726             :         struct amdgpu_ras_block_object *ras_obj;
    1727             : 
    1728           0 :         if (!obj) {
     1729             :                 /* in case we register the IH before enabling the RAS feature */
    1730           0 :                 obj = amdgpu_ras_create_obj(adev, head);
    1731           0 :                 if (!obj)
    1732             :                         return -EINVAL;
    1733             :         } else
    1734           0 :                 get_obj(obj);
    1735             : 
    1736           0 :         ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
    1737             : 
    1738           0 :         data = &obj->ih_data;
     1739             :         /* set up the callback, element sizes, etc. */
    1740           0 :         *data = (struct ras_ih_data) {
    1741             :                 .inuse = 0,
    1742           0 :                 .cb = ras_obj->ras_cb,
    1743             :                 .element_size = sizeof(struct amdgpu_iv_entry),
    1744             :                 .rptr = 0,
    1745             :                 .wptr = 0,
    1746             :         };
    1747             : 
    1748           0 :         INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
    1749             : 
    1750           0 :         data->aligned_element_size = ALIGN(data->element_size, 8);
    1751             :         /* the ring can store 64 iv entries. */
    1752           0 :         data->ring_size = 64 * data->aligned_element_size;
    1753           0 :         data->ring = kmalloc(data->ring_size, GFP_KERNEL);
    1754           0 :         if (!data->ring) {
    1755           0 :                 put_obj(obj);
    1756           0 :                 return -ENOMEM;
    1757             :         }
    1758             : 
    1759             :         /* IH is ready */
    1760           0 :         data->inuse = 1;
    1761             : 
    1762           0 :         return 0;
    1763             : }
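                      : 
                      : /*
                      :  * Sizing example (illustrative, assuming a hypothetical 96-byte
                      :  * amdgpu_iv_entry): ALIGN(96, 8) stays 96, so the ring above would
                      :  * hold 64 entries in 64 * 96 = 6144 bytes.
                      :  */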
    1764             : 
    1765           0 : static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
    1766             : {
    1767           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1768             :         struct ras_manager *obj, *tmp;
    1769             : 
    1770           0 :         list_for_each_entry_safe(obj, tmp, &con->head, node) {
    1771           0 :                 amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
    1772             :         }
    1773             : 
    1774           0 :         return 0;
    1775             : }
    1776             : /* ih end */
    1777             : 
     1778             : /* traverse all IPs except NBIO to query error counters */
    1779           0 : static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
    1780             : {
    1781           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1782             :         struct ras_manager *obj;
    1783             : 
    1784           0 :         if (!adev->ras_enabled || !con)
    1785             :                 return;
    1786             : 
    1787           0 :         list_for_each_entry(obj, &con->head, node) {
    1788           0 :                 struct ras_query_if info = {
    1789             :                         .head = obj->head,
    1790             :                 };
    1791             : 
     1792             :                 /*
     1793             :                  * The PCIE_BIF IP has its own ISR for the RAS controller
     1794             :                  * interrupt, and the block-specific RAS counter query is
     1795             :                  * done there. So skip this block in the common sync flood
     1796             :                  * interrupt ISR path.
     1797             :                  */
    1798           0 :                 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
    1799           0 :                         continue;
    1800             : 
     1801             :                 /*
     1802             :                  * This is a workaround for Aldebaran: skip sending the
     1803             :                  * message to the SMU to fetch the ecc_info table, because
     1804             :                  * the SMU currently fails to handle that request.
     1805             :                  * Remove this once the SMU side is fixed.
     1806             :                  */
    1807           0 :                 if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
    1808           0 :                         (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
    1809           0 :                         continue;
    1810             : 
    1811           0 :                 amdgpu_ras_query_error_status(adev, &info);
    1812             : 
    1813           0 :                 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
    1814             :                     adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
    1815           0 :                         if (amdgpu_ras_reset_error_status(adev, info.head.block))
    1816           0 :                                 dev_warn(adev->dev, "Failed to reset error counter and error status");
    1817             :                 }
    1818             :         }
    1819             : }
    1820             : 
    1821             : /* Parse RdRspStatus and WrRspStatus */
    1822           0 : static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
    1823             :                                           struct ras_query_if *info)
    1824             : {
    1825             :         struct amdgpu_ras_block_object *block_obj;
     1826             :         /*
     1827             :          * Only two blocks currently need to query the
     1828             :          * read/write RspStatus.
     1829             :          */
    1830           0 :         if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
    1831             :                 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
    1832             :                 return;
    1833             : 
    1834           0 :         block_obj = amdgpu_ras_get_ras_block(adev,
    1835             :                                         info->head.block,
    1836             :                                         info->head.sub_block_index);
    1837             : 
    1838           0 :         if (!block_obj || !block_obj->hw_ops) {
    1839           0 :                 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
    1840             :                              get_ras_block_str(&info->head));
    1841             :                 return;
    1842             :         }
    1843             : 
    1844           0 :         if (block_obj->hw_ops->query_ras_error_status)
    1845           0 :                 block_obj->hw_ops->query_ras_error_status(adev);
    1846             : 
    1847             : }
    1848             : 
    1849           0 : static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
    1850             : {
    1851           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1852             :         struct ras_manager *obj;
    1853             : 
    1854           0 :         if (!adev->ras_enabled || !con)
    1855             :                 return;
    1856             : 
    1857           0 :         list_for_each_entry(obj, &con->head, node) {
    1858           0 :                 struct ras_query_if info = {
    1859             :                         .head = obj->head,
    1860             :                 };
    1861             : 
    1862           0 :                 amdgpu_ras_error_status_query(adev, &info);
    1863             :         }
    1864             : }
    1865             : 
    1866             : /* recovery begin */
    1867             : 
     1868             : /* Return 0 on success.
     1869             :  * The caller must free *bps.
     1870             :  */
    1871           0 : static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
    1872             :                 struct ras_badpage **bps, unsigned int *count)
    1873             : {
    1874           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1875             :         struct ras_err_handler_data *data;
    1876           0 :         int i = 0;
    1877           0 :         int ret = 0, status;
    1878             : 
    1879           0 :         if (!con || !con->eh_data || !bps || !count)
    1880             :                 return -EINVAL;
    1881             : 
    1882           0 :         mutex_lock(&con->recovery_lock);
    1883           0 :         data = con->eh_data;
    1884           0 :         if (!data || data->count == 0) {
    1885           0 :                 *bps = NULL;
    1886           0 :                 ret = -EINVAL;
    1887           0 :                 goto out;
    1888             :         }
    1889             : 
    1890           0 :         *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
    1891           0 :         if (!*bps) {
    1892             :                 ret = -ENOMEM;
    1893             :                 goto out;
    1894             :         }
    1895             : 
    1896           0 :         for (; i < data->count; i++) {
    1897           0 :                 (*bps)[i] = (struct ras_badpage){
    1898           0 :                         .bp = data->bps[i].retired_page,
    1899             :                         .size = AMDGPU_GPU_PAGE_SIZE,
    1900             :                         .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
    1901             :                 };
    1902           0 :                 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
    1903           0 :                                 data->bps[i].retired_page);
    1904           0 :                 if (status == -EBUSY)
    1905           0 :                         (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
    1906           0 :                 else if (status == -ENOENT)
    1907           0 :                         (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
    1908             :         }
    1909             : 
    1910           0 :         *count = data->count;
    1911             : out:
    1912           0 :         mutex_unlock(&con->recovery_lock);
    1913           0 :         return ret;
    1914             : }
    1915             : 
    1916           0 : static void amdgpu_ras_do_recovery(struct work_struct *work)
    1917             : {
    1918           0 :         struct amdgpu_ras *ras =
    1919           0 :                 container_of(work, struct amdgpu_ras, recovery_work);
    1920           0 :         struct amdgpu_device *remote_adev = NULL;
    1921           0 :         struct amdgpu_device *adev = ras->adev;
    1922           0 :         struct list_head device_list, *device_list_handle =  NULL;
    1923             : 
    1924           0 :         if (!ras->disable_ras_err_cnt_harvest) {
    1925           0 :                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
    1926             : 
    1927             :                 /* Build list of devices to query RAS related errors */
    1928           0 :                 if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
    1929           0 :                         device_list_handle = &hive->device_list;
    1930             :                 } else {
    1931           0 :                         INIT_LIST_HEAD(&device_list);
    1932           0 :                         list_add_tail(&adev->gmc.xgmi.head, &device_list);
    1933           0 :                         device_list_handle = &device_list;
    1934             :                 }
    1935             : 
    1936           0 :                 list_for_each_entry(remote_adev,
    1937             :                                 device_list_handle, gmc.xgmi.head) {
    1938           0 :                         amdgpu_ras_query_err_status(remote_adev);
    1939           0 :                         amdgpu_ras_log_on_err_counter(remote_adev);
    1940             :                 }
    1941             : 
    1942           0 :                 amdgpu_put_xgmi_hive(hive);
    1943             :         }
    1944             : 
    1945           0 :         if (amdgpu_device_should_recover_gpu(ras->adev)) {
    1946             :                 struct amdgpu_reset_context reset_context;
    1947           0 :                 memset(&reset_context, 0, sizeof(reset_context));
    1948             : 
    1949           0 :                 reset_context.method = AMD_RESET_METHOD_NONE;
    1950           0 :                 reset_context.reset_req_dev = adev;
    1951           0 :                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
    1952           0 :                 clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
    1953             : 
    1954           0 :                 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
    1955             :         }
    1956           0 :         atomic_set(&ras->in_recovery, 0);
    1957           0 : }
    1958             : 
    1959             : /* alloc/realloc bps array */
    1960           0 : static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
    1961             :                 struct ras_err_handler_data *data, int pages)
    1962             : {
    1963           0 :         unsigned int old_space = data->count + data->space_left;
    1964           0 :         unsigned int new_space = old_space + pages;
    1965           0 :         unsigned int align_space = ALIGN(new_space, 512);
    1966           0 :         void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
    1967             : 
    1968           0 :         if (!bps) {
    1969             :                 return -ENOMEM;
    1970             :         }
    1971             : 
    1972           0 :         if (data->bps) {
    1973           0 :                 memcpy(bps, data->bps,
    1974           0 :                                 data->count * sizeof(*data->bps));
    1975           0 :                 kfree(data->bps);
    1976             :         }
    1977             : 
    1978           0 :         data->bps = bps;
    1979           0 :         data->space_left += align_space - old_space;
    1980             :         return 0;
    1981             : }
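
The reallocation above grows the bps array in 512-record steps via ALIGN().
A minimal, runnable userspace sketch of that arithmetic (the ALIGN macro is
a stand-in for the kernel's; 256 and 512 are the same constants used above):

        #include <stdio.h>

        /* userspace stand-in for the kernel's ALIGN() macro */
        #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

        int main(void)
        {
                unsigned int count = 0, space_left = 0;   /* empty bps array */
                unsigned int pages = 256;                 /* requested slots */
                unsigned int old_space = count + space_left;
                unsigned int align_space = ALIGN(old_space + pages, 512);

                space_left += align_space - old_space;
                /* prints: allocated 512 records, 512 slots free */
                printf("allocated %u records, %u slots free\n",
                       align_space, space_left);
                return 0;
        }
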
    1982             : 
    1983             : /* it deals with vram only. */
    1984           0 : int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
    1985             :                 struct eeprom_table_record *bps, int pages)
    1986             : {
    1987           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    1988             :         struct ras_err_handler_data *data;
    1989           0 :         int ret = 0;
    1990             :         uint32_t i;
    1991             : 
    1992           0 :         if (!con || !con->eh_data || !bps || pages <= 0)
    1993             :                 return 0;
    1994             : 
    1995           0 :         mutex_lock(&con->recovery_lock);
    1996           0 :         data = con->eh_data;
    1997           0 :         if (!data)
    1998             :                 goto out;
    1999             : 
    2000           0 :         for (i = 0; i < pages; i++) {
    2001           0 :                 if (amdgpu_ras_check_bad_page_unlock(con,
    2002           0 :                         bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
    2003           0 :                         continue;
    2004             : 
    2005           0 :                 if (!data->space_left &&
    2006           0 :                         amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
    2007             :                         ret = -ENOMEM;
    2008             :                         goto out;
    2009             :                 }
    2010             : 
    2011           0 :                 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
    2012           0 :                         bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
    2013             :                         AMDGPU_GPU_PAGE_SIZE);
    2014             : 
    2015           0 :                 memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
    2016           0 :                 data->count++;
    2017           0 :                 data->space_left--;
    2018             :         }
    2019             : out:
    2020           0 :         mutex_unlock(&con->recovery_lock);
    2021             : 
    2022           0 :         return ret;
    2023             : }
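
A hedged sketch of the intended caller pattern (a fragment, not a compilable
unit; it mirrors the MCA notifier near the end of this file, which fills a
record, adds it as a bad page, then persists it):

        struct eeprom_table_record err_rec;

        memset(&err_rec, 0, sizeof(err_rec));
        /* fill retired_page etc., e.g. via amdgpu_umc_fill_error_record() */
        amdgpu_ras_add_bad_pages(adev, &err_rec, 1);
        amdgpu_ras_save_bad_pages(adev);   /* append new entries to EEPROM */
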
    2024             : 
    2025             : /*
    2026             :  * write the error record array to eeprom; the function should be
    2027             :  * protected by recovery_lock
    2028             :  */
    2029           0 : int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
    2030             : {
    2031           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2032             :         struct ras_err_handler_data *data;
    2033             :         struct amdgpu_ras_eeprom_control *control;
    2034             :         int save_count;
    2035             : 
    2036           0 :         if (!con || !con->eh_data)
    2037             :                 return 0;
    2038             : 
    2039           0 :         mutex_lock(&con->recovery_lock);
    2040           0 :         control = &con->eeprom_control;
    2041           0 :         data = con->eh_data;
    2042           0 :         save_count = data->count - control->ras_num_recs;
    2043           0 :         mutex_unlock(&con->recovery_lock);
    2044             :         /* only new entries are saved */
    2045           0 :         if (save_count > 0) {
    2046           0 :                 if (amdgpu_ras_eeprom_append(control,
    2047           0 :                                              &data->bps[control->ras_num_recs],
    2048             :                                              save_count)) {
    2049           0 :                         dev_err(adev->dev, "Failed to save EEPROM table data!");
    2050           0 :                         return -EIO;
    2051             :                 }
    2052             : 
    2053           0 :                 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
    2054             :         }
    2055             : 
    2056             :         return 0;
    2057             : }
    2058             : 
    2059             : /*
    2060             :  * read error record array in eeprom and reserve enough space for
    2061             :  * storing new bad pages
    2062             :  */
    2063           0 : static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
    2064             : {
    2065           0 :         struct amdgpu_ras_eeprom_control *control =
    2066           0 :                 &adev->psp.ras_context.ras->eeprom_control;
    2067             :         struct eeprom_table_record *bps;
    2068             :         int ret;
    2069             : 
    2070             :         /* no bad page record, skip eeprom access */
    2071           0 :         if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
    2072             :                 return 0;
    2073             : 
    2074           0 :         bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
    2075           0 :         if (!bps)
    2076             :                 return -ENOMEM;
    2077             : 
    2078           0 :         ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
    2079           0 :         if (ret)
    2080           0 :                 dev_err(adev->dev, "Failed to load EEPROM table records!");
    2081             :         else
    2082           0 :                 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
    2083             : 
    2084           0 :         kfree(bps);
    2085           0 :         return ret;
    2086             : }
    2087             : 
    2088             : static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
    2089             :                                 uint64_t addr)
    2090             : {
    2091           0 :         struct ras_err_handler_data *data = con->eh_data;
    2092             :         int i;
    2093             : 
    2094           0 :         addr >>= AMDGPU_GPU_PAGE_SHIFT;
    2095           0 :         for (i = 0; i < data->count; i++)
    2096           0 :                 if (addr == data->bps[i].retired_page)
    2097             :                         return true;
    2098             : 
    2099             :         return false;
    2100             : }
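
The lookup above matches at GPU-page granularity: the incoming byte address
is shifted down before being compared with the stored page frame numbers. A
runnable sketch (AMDGPU_GPU_PAGE_SHIFT of 12, i.e. 4 KiB pages, is an
assumption here):

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define AMDGPU_GPU_PAGE_SHIFT 12   /* assumed: 4 KiB GPU pages */

        int main(void)
        {
                uint64_t retired_page = 0x12345;    /* page frame number */
                uint64_t fault_addr = 0x12345678;   /* byte address      */
                bool hit = (fault_addr >> AMDGPU_GPU_PAGE_SHIFT) == retired_page;

                /* prints: 0x12345678 is in retired page 0x12345 */
                printf("0x%llx %s in retired page 0x%llx\n",
                       (unsigned long long)fault_addr,
                       hit ? "is" : "is not",
                       (unsigned long long)retired_page);
                return 0;
        }
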
    2101             : 
    2102             : /*
    2103             :  * check if an address belongs to a bad page
    2104             :  *
    2105             :  * Note: this check is only for the umc block
    2106             :  */
    2107             : static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
    2108             :                                 uint64_t addr)
    2109             : {
    2110             :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2111             :         bool ret = false;
    2112             : 
    2113             :         if (!con || !con->eh_data)
    2114             :                 return ret;
    2115             : 
    2116             :         mutex_lock(&con->recovery_lock);
    2117             :         ret = amdgpu_ras_check_bad_page_unlock(con, addr);
    2118             :         mutex_unlock(&con->recovery_lock);
    2119             :         return ret;
    2120             : }
    2121             : 
    2122             : static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
    2123             :                                           uint32_t max_count)
    2124             : {
    2125           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2126             : 
    2127             :         /*
    2128             :          * Justification of value bad_page_cnt_threshold in ras structure
    2129             :          *
    2130             :          * Generally, -1 <= amdgpu_bad_page_threshold <= max record length
    2131             :          * in eeprom, which introduces two scenarios:
    2132             :          *
    2133             :          * Bad page retirement enablement:
    2134             :          *    - If amdgpu_bad_page_threshold = -1,
    2135             :          *      bad_page_cnt_threshold is computed from the VRAM size below.
    2136             :          *
    2137             :          *    - When the value from the user is 0 < amdgpu_bad_page_threshold <
    2138             :          *      max record length in eeprom, use it directly.
    2139             :          *
    2140             :          * Bad page retirement disablement:
    2141             :          *    - If amdgpu_bad_page_threshold = 0, bad page retirement
    2142             :          *      functionality is disabled, and bad_page_cnt_threshold has
    2143             :          *      no effect.
    2144             :          */
    2145             : 
    2146           0 :         if (amdgpu_bad_page_threshold < 0) {
    2147           0 :                 u64 val = adev->gmc.mc_vram_size;
    2148             : 
    2149           0 :                 do_div(val, RAS_BAD_PAGE_COVER);
    2150           0 :                 con->bad_page_cnt_threshold = min(lower_32_bits(val),
    2151             :                                                   max_count);
    2152             :         } else {
    2153           0 :                 con->bad_page_cnt_threshold = min_t(int, max_count,
    2154             :                                                     amdgpu_bad_page_threshold);
    2155             :         }
    2156             : }
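
A runnable sketch of the default (-1) path above. The 100 MiB-per-page value
for RAS_BAD_PAGE_COVER is an assumption mirroring contemporary kernels; the
real constant is defined elsewhere in amdgpu_ras.c:

        #include <stdint.h>
        #include <stdio.h>

        /* assumption: one retired page allowed per 100 MiB of VRAM */
        #define RAS_BAD_PAGE_COVER (100ULL * 1024 * 1024)

        int main(void)
        {
                uint64_t vram = 32ULL << 30;    /* a 32 GiB board         */
                uint32_t max_count = 512;       /* EEPROM record capacity */
                uint32_t typical = (uint32_t)(vram / RAS_BAD_PAGE_COVER);
                uint32_t threshold = typical < max_count ? typical : max_count;

                /* prints: bad_page_cnt_threshold = 327 */
                printf("bad_page_cnt_threshold = %u\n", threshold);
                return 0;
        }
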
    2157             : 
    2158           0 : int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
    2159             : {
    2160           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2161             :         struct ras_err_handler_data **data;
    2162           0 :         u32  max_eeprom_records_count = 0;
    2163           0 :         bool exc_err_limit = false;
    2164             :         int ret;
    2165             : 
    2166           0 :         if (!con || amdgpu_sriov_vf(adev))
    2167             :                 return 0;
    2168             : 
    2169             :         /* Allow access to the RAS EEPROM via debugfs when the ASIC
    2170             :          * supports RAS and debugfs is enabled, even when
    2171             :          * adev->ras_enabled is unset, i.e. when the "ras_enable"
    2172             :          * module parameter is set to 0.
    2173             :          */
    2174           0 :         con->adev = adev;
    2175             : 
    2176           0 :         if (!adev->ras_enabled)
    2177             :                 return 0;
    2178             : 
    2179           0 :         data = &con->eh_data;
    2180           0 :         *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
    2181           0 :         if (!*data) {
    2182             :                 ret = -ENOMEM;
    2183             :                 goto out;
    2184             :         }
    2185             : 
    2186           0 :         mutex_init(&con->recovery_lock);
    2187           0 :         INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
    2188           0 :         atomic_set(&con->in_recovery, 0);
    2189           0 :         con->eeprom_control.bad_channel_bitmap = 0;
    2190             : 
    2191           0 :         max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
    2192           0 :         amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
    2193             : 
    2194             :         /* Todo: during testing, the SMU might fail to read the eeprom
    2195             :          * through I2C when the GPU is pending an XGMI reset at probe time
    2196             :          * (mostly after a second bus reset); skip it for now
    2197             :          */
    2198           0 :         if (adev->gmc.xgmi.pending_reset)
    2199             :                 return 0;
    2200           0 :         ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
    2201             :         /*
    2202             :          * This call fails when exc_err_limit is true or
    2203             :          * ret != 0.
    2204             :          */
    2205           0 :         if (exc_err_limit || ret)
    2206             :                 goto free;
    2207             : 
    2208           0 :         if (con->eeprom_control.ras_num_recs) {
    2209           0 :                 ret = amdgpu_ras_load_bad_pages(adev);
    2210           0 :                 if (ret)
    2211             :                         goto free;
    2212             : 
    2213           0 :                 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
    2214             : 
    2215           0 :                 if (con->update_channel_flag == true) {
    2216           0 :                         amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
    2217           0 :                         con->update_channel_flag = false;
    2218             :                 }
    2219             :         }
    2220             : 
    2221             : #ifdef CONFIG_X86_MCE_AMD
    2222             :         if ((adev->asic_type == CHIP_ALDEBARAN) &&
    2223             :             (adev->gmc.xgmi.connected_to_cpu))
    2224             :                 amdgpu_register_bad_pages_mca_notifier(adev);
    2225             : #endif
    2226             :         return 0;
    2227             : 
    2228             : free:
    2229           0 :         kfree((*data)->bps);
    2230           0 :         kfree(*data);
    2231           0 :         con->eh_data = NULL;
    2232             : out:
    2233           0 :         dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
    2234             : 
    2235             :         /*
    2236             :          * Except for the error-threshold-exceeded case, other failures in
    2237             :          * this function do not fail amdgpu driver init.
    2238             :          */
    2239           0 :         if (!exc_err_limit)
    2240             :                 ret = 0;
    2241             :         else
    2242           0 :                 ret = -EINVAL;
    2243             : 
    2244             :         return ret;
    2245             : }
    2246             : 
    2247           0 : static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
    2248             : {
    2249           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2250           0 :         struct ras_err_handler_data *data = con->eh_data;
    2251             : 
    2252             :         /* recovery_init failed to init it, fini is useless */
    2253           0 :         if (!data)
    2254             :                 return 0;
    2255             : 
    2256           0 :         cancel_work_sync(&con->recovery_work);
    2257             : 
    2258           0 :         mutex_lock(&con->recovery_lock);
    2259           0 :         con->eh_data = NULL;
    2260           0 :         kfree(data->bps);
    2261           0 :         kfree(data);
    2262           0 :         mutex_unlock(&con->recovery_lock);
    2263             : 
    2264           0 :         return 0;
    2265             : }
    2266             : /* recovery end */
    2267             : 
    2268             : static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
    2269             : {
    2270           0 :         return adev->asic_type == CHIP_VEGA10 ||
    2271           0 :                 adev->asic_type == CHIP_VEGA20 ||
    2272           0 :                 adev->asic_type == CHIP_ARCTURUS ||
    2273           0 :                 adev->asic_type == CHIP_ALDEBARAN ||
    2274             :                 adev->asic_type == CHIP_SIENNA_CICHLID;
    2275             : }
    2276             : 
    2277             : /*
    2278             :  * this is a workaround for the vega20 workstation sku:
    2279             :  * force enable gfx ras and ignore the vbios gfx ras flag,
    2280             :  * since GC EDC cannot be written
    2281             :  */
    2282           0 : static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
    2283             : {
    2284           0 :         struct atom_context *ctx = adev->mode_info.atom_context;
    2285             : 
    2286           0 :         if (!ctx)
    2287             :                 return;
    2288             : 
    2289           0 :         if (strnstr(ctx->vbios_version, "D16406",
    2290           0 :                     sizeof(ctx->vbios_version)) ||
    2291           0 :                 strnstr(ctx->vbios_version, "D36002",
    2292             :                         sizeof(ctx->vbios_version)))
    2293           0 :                 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
    2294             : }
    2295             : 
    2296             : /*
    2297             :  * check the hardware's ras ability, which is saved in hw_supported.
    2298             :  * if the hardware does not support ras, we can skip some ras
    2299             :  * initialization and forbid some ras operations from IP blocks.
    2300             :  * if software itself (say, a boot parameter) limits the ras ability,
    2301             :  * we still need to allow IP blocks some limited operations, like
    2302             :  * disable. in such cases we have to initialize ras as normal, but each
    2303             :  * function must check whether the operation is allowed.
    2304             :  */
    2305           0 : static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
    2306             : {
    2307           0 :         adev->ras_hw_enabled = adev->ras_enabled = 0;
    2308             : 
    2309           0 :         if (!adev->is_atom_fw ||
    2310           0 :             !amdgpu_ras_asic_supported(adev))
    2311             :                 return;
    2312             : 
    2313             :         /* If the driver runs on the sriov guest side, only enable ras for aldebaran */
    2314           0 :         if (amdgpu_sriov_vf(adev) &&
    2315           0 :                 adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2))
    2316             :                 return;
    2317             : 
    2318           0 :         if (!adev->gmc.xgmi.connected_to_cpu) {
    2319           0 :                 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
    2320           0 :                         dev_info(adev->dev, "MEM ECC is active.\n");
    2321           0 :                         adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
    2322             :                                                    1 << AMDGPU_RAS_BLOCK__DF);
    2323             :                 } else {
    2324           0 :                         dev_info(adev->dev, "MEM ECC is not present.\n");
    2325             :                 }
    2326             : 
    2327           0 :                 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
    2328           0 :                         dev_info(adev->dev, "SRAM ECC is active.\n");
    2329           0 :                         if (!amdgpu_sriov_vf(adev)) {
    2330           0 :                                 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
    2331             :                                                             1 << AMDGPU_RAS_BLOCK__DF);
    2332             : 
    2333           0 :                                 if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
    2334             :                                         adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
    2335             :                                                         1 << AMDGPU_RAS_BLOCK__JPEG);
    2336             :                                 else
    2337           0 :                                         adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
    2338             :                                                         1 << AMDGPU_RAS_BLOCK__JPEG);
    2339             :                         } else {
    2340           0 :                                 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
    2341             :                                                                 1 << AMDGPU_RAS_BLOCK__SDMA |
    2342             :                                                                 1 << AMDGPU_RAS_BLOCK__GFX);
    2343             :                         }
    2344             :                 } else {
    2345           0 :                         dev_info(adev->dev, "SRAM ECC is not present.\n");
    2346             :                 }
    2347             :         } else {
    2348             :                 /* the driver only manages the RAS features of a few IP
    2349             :                  * blocks when the GPU is connected to the CPU through XGMI */
    2350           0 :                 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
    2351             :                                            1 << AMDGPU_RAS_BLOCK__SDMA |
    2352             :                                            1 << AMDGPU_RAS_BLOCK__MMHUB);
    2353             :         }
    2354             : 
    2355           0 :         amdgpu_ras_get_quirks(adev);
    2356             : 
    2357             :         /* hw_supported needs to be aligned with RAS block mask. */
    2358           0 :         adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
    2359             : 
    2360           0 :         adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
    2361           0 :                 adev->ras_hw_enabled & amdgpu_ras_mask;
    2362             : }
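
The enablement logic above is plain bit arithmetic over AMDGPU_RAS_BLOCK__*
positions. A runnable sketch (block ordering assumed to follow
ras_block_string[] at the top of this file):

        #include <stdio.h>

        /* bit positions assumed to follow ras_block_string[] */
        enum { UMC, SDMA, GFX, MMHUB, ATHUB, PCIE_BIF };

        int main(void)
        {
                unsigned int hw = (1u << UMC) | (1u << SDMA) | (1u << GFX);
                unsigned int mask = ~(1u << GFX);   /* user masked gfx out */
                unsigned int enabled = hw & mask;

                /* prints: hw=0x7 enabled=0x3 */
                printf("hw=0x%x enabled=0x%x\n", hw, enabled);
                return 0;
        }
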
    2363             : 
    2364           0 : static void amdgpu_ras_counte_dw(struct work_struct *work)
    2365             : {
    2366           0 :         struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
    2367             :                                               ras_counte_delay_work.work);
    2368           0 :         struct amdgpu_device *adev = con->adev;
    2369           0 :         struct drm_device *dev = adev_to_drm(adev);
    2370             :         unsigned long ce_count, ue_count;
    2371             :         int res;
    2372             : 
    2373           0 :         res = pm_runtime_get_sync(dev->dev);
    2374           0 :         if (res < 0)
    2375             :                 goto Out;
    2376             : 
    2377             :         /* Cache new values.
    2378             :          */
    2379           0 :         if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
    2380           0 :                 atomic_set(&con->ras_ce_count, ce_count);
    2381           0 :                 atomic_set(&con->ras_ue_count, ue_count);
    2382             :         }
    2383             : 
    2384           0 :         pm_runtime_mark_last_busy(dev->dev);
    2385             : Out:
    2386           0 :         pm_runtime_put_autosuspend(dev->dev);
    2387           0 : }
    2388             : 
    2389           0 : int amdgpu_ras_init(struct amdgpu_device *adev)
    2390             : {
    2391           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2392             :         int r;
    2393             :         bool df_poison, umc_poison;
    2394             : 
    2395           0 :         if (con)
    2396             :                 return 0;
    2397             : 
    2398           0 :         con = kmalloc(sizeof(struct amdgpu_ras) +
    2399             :                         sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
    2400             :                         sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
    2401             :                         GFP_KERNEL|__GFP_ZERO);
    2402           0 :         if (!con)
    2403             :                 return -ENOMEM;
    2404             : 
    2405           0 :         con->adev = adev;
    2406           0 :         INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
    2407           0 :         atomic_set(&con->ras_ce_count, 0);
    2408           0 :         atomic_set(&con->ras_ue_count, 0);
    2409             : 
    2410           0 :         con->objs = (struct ras_manager *)(con + 1);
    2411             : 
    2412           0 :         amdgpu_ras_set_context(adev, con);
    2413             : 
    2414           0 :         amdgpu_ras_check_supported(adev);
    2415             : 
    2416           0 :         if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
    2417             :                 /* set the gfx block ras context feature for VEGA20 Gaming
    2418             :                  * so a ras disable cmd can be sent to the ras ta during late init.
    2419             :                  */
    2420           0 :                 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
    2421           0 :                         con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
    2422             : 
    2423           0 :                         return 0;
    2424             :                 }
    2425             : 
    2426             :                 r = 0;
    2427             :                 goto release_con;
    2428             :         }
    2429             : 
    2430           0 :         con->update_channel_flag = false;
    2431           0 :         con->features = 0;
    2432           0 :         INIT_LIST_HEAD(&con->head);
    2433             :         /* Might need to get this flag from vbios. */
    2434           0 :         con->flags = RAS_DEFAULT_FLAGS;
    2435             : 
    2436             :         /* initialize the nbio ras function ahead of any other
    2437             :          * ras functions so the hardware fatal error interrupt
    2438             :          * can be enabled as early as possible */
    2439             :         switch (adev->asic_type) {
    2440             :         case CHIP_VEGA20:
    2441             :         case CHIP_ARCTURUS:
    2442             :         case CHIP_ALDEBARAN:
    2443           0 :                 if (!adev->gmc.xgmi.connected_to_cpu) {
    2444           0 :                         adev->nbio.ras = &nbio_v7_4_ras;
    2445           0 :                         amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
    2446           0 :                         adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
    2447             :                 }
    2448             :                 break;
    2449             :         default:
    2450             :                 /* nbio ras is not available */
    2451             :                 break;
    2452             :         }
    2453             : 
    2454           0 :         if (adev->nbio.ras &&
    2455           0 :             adev->nbio.ras->init_ras_controller_interrupt) {
    2456           0 :                 r = adev->nbio.ras->init_ras_controller_interrupt(adev);
    2457           0 :                 if (r)
    2458             :                         goto release_con;
    2459             :         }
    2460             : 
    2461           0 :         if (adev->nbio.ras &&
    2462           0 :             adev->nbio.ras->init_ras_err_event_athub_interrupt) {
    2463           0 :                 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
    2464           0 :                 if (r)
    2465             :                         goto release_con;
    2466             :         }
    2467             : 
    2468             :         /* Init the poison supported flag; the default value is false */
    2469           0 :         if (adev->gmc.xgmi.connected_to_cpu) {
    2470             :                 /* enabled by default when GPU is connected to CPU */
    2471           0 :                 con->poison_supported = true;
    2472             :         }
    2473           0 :         else if (adev->df.funcs &&
    2474           0 :             adev->df.funcs->query_ras_poison_mode &&
    2475           0 :             adev->umc.ras &&
    2476           0 :             adev->umc.ras->query_ras_poison_mode) {
    2477           0 :                 df_poison =
    2478             :                         adev->df.funcs->query_ras_poison_mode(adev);
    2479           0 :                 umc_poison =
    2480           0 :                         adev->umc.ras->query_ras_poison_mode(adev);
    2481             :                 /* Only when poison is set in both DF and UMC can we support it */
    2482           0 :                 if (df_poison && umc_poison)
    2483           0 :                         con->poison_supported = true;
    2484           0 :                 else if (df_poison != umc_poison)
    2485           0 :                         dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
    2486             :                                         df_poison, umc_poison);
    2487             :         }
    2488             : 
    2489           0 :         if (amdgpu_ras_fs_init(adev)) {
    2490             :                 r = -EINVAL;
    2491             :                 goto release_con;
    2492             :         }
    2493             : 
    2494           0 :         dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
    2495             :                  "hardware ability[%x] ras_mask[%x]\n",
    2496             :                  adev->ras_hw_enabled, adev->ras_enabled);
    2497             : 
    2498           0 :         return 0;
    2499             : release_con:
    2500           0 :         amdgpu_ras_set_context(adev, NULL);
    2501           0 :         kfree(con);
    2502             : 
    2503           0 :         return r;
    2504             : }
    2505             : 
    2506           0 : int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
    2507             : {
    2508           0 :         if (adev->gmc.xgmi.connected_to_cpu)
    2509             :                 return 1;
    2510           0 :         return 0;
    2511             : }
    2512             : 
    2513           0 : static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
    2514             :                                         struct ras_common_if *ras_block)
    2515             : {
    2516           0 :         struct ras_query_if info = {
    2517             :                 .head = *ras_block,
    2518             :         };
    2519             : 
    2520           0 :         if (!amdgpu_persistent_edc_harvesting_supported(adev))
    2521             :                 return 0;
    2522             : 
    2523           0 :         if (amdgpu_ras_query_error_status(adev, &info) != 0)
    2524           0 :                 DRM_WARN("RAS init harvest failure");
    2525             : 
    2526           0 :         if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
    2527           0 :                 DRM_WARN("RAS init harvest reset failure");
    2528             : 
    2529             :         return 0;
    2530             : }
    2531             : 
    2532           0 : bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
    2533             : {
    2534           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2535             : 
    2536           0 :         if (!con)
    2537             :                 return false;
    2538             : 
    2539           0 :         return con->poison_supported;
    2540             : }
    2541             : 
    2542             : /* helper function to handle common stuff in ip late init phase */
    2543           0 : int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
    2544             :                          struct ras_common_if *ras_block)
    2545             : {
    2546           0 :         struct amdgpu_ras_block_object *ras_obj = NULL;
    2547           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2548             :         unsigned long ue_count, ce_count;
    2549             :         int r;
    2550             : 
    2551             :         /* disable RAS feature per IP block if it is not supported */
    2552           0 :         if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
    2553           0 :                 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
    2554           0 :                 return 0;
    2555             :         }
    2556             : 
    2557           0 :         r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
    2558           0 :         if (r) {
    2559           0 :                 if (adev->in_suspend || amdgpu_in_reset(adev)) {
    2560             :                         /* in the resume phase, if we fail to enable ras,
    2561             :                          * clean up all ras fs nodes and disable ras */
    2562             :                         goto cleanup;
    2563             :                 } else
    2564             :                         return r;
    2565             :         }
    2566             : 
    2567             :         /* on ASICs with persistent EDC harvesting, check for errors left over from a warm reset */
    2568           0 :         amdgpu_persistent_edc_harvesting(adev, ras_block);
    2569             : 
    2570             :         /* in resume phase, no need to create ras fs node */
    2571           0 :         if (adev->in_suspend || amdgpu_in_reset(adev))
    2572             :                 return 0;
    2573             : 
    2574           0 :         ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
    2575           0 :         if (ras_obj->ras_cb || (ras_obj->hw_ops &&
    2576           0 :             (ras_obj->hw_ops->query_poison_status ||
    2577           0 :             ras_obj->hw_ops->handle_poison_consumption))) {
    2578           0 :                 r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
    2579           0 :                 if (r)
    2580             :                         goto cleanup;
    2581             :         }
    2582             : 
    2583           0 :         r = amdgpu_ras_sysfs_create(adev, ras_block);
    2584           0 :         if (r)
    2585             :                 goto interrupt;
    2586             : 
    2587             :         /* Those are the cached values at init.
    2588             :          */
    2589           0 :         if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
    2590           0 :                 atomic_set(&con->ras_ce_count, ce_count);
    2591           0 :                 atomic_set(&con->ras_ue_count, ue_count);
    2592             :         }
    2593             : 
    2594             :         return 0;
    2595             : 
    2596             : interrupt:
    2597           0 :         if (ras_obj->ras_cb)
    2598           0 :                 amdgpu_ras_interrupt_remove_handler(adev, ras_block);
    2599             : cleanup:
    2600           0 :         amdgpu_ras_feature_enable(adev, ras_block, 0);
    2601           0 :         return r;
    2602             : }
    2603             : 
    2604             : static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
    2605             :                          struct ras_common_if *ras_block)
    2606             : {
    2607           0 :         return amdgpu_ras_block_late_init(adev, ras_block);
    2608             : }
    2609             : 
    2610             : /* helper function to remove ras fs node and interrupt handler */
    2611           0 : void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
    2612             :                           struct ras_common_if *ras_block)
    2613             : {
    2614             :         struct amdgpu_ras_block_object *ras_obj;
    2615           0 :         if (!ras_block)
    2616             :                 return;
    2617             : 
    2618           0 :         amdgpu_ras_sysfs_remove(adev, ras_block);
    2619             : 
    2620           0 :         ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
    2621           0 :         if (ras_obj->ras_cb)
    2622           0 :                 amdgpu_ras_interrupt_remove_handler(adev, ras_block);
    2623             : }
    2624             : 
    2625             : static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
    2626             :                           struct ras_common_if *ras_block)
    2627             : {
    2628           0 :         return amdgpu_ras_block_late_fini(adev, ras_block);
    2629             : }
    2630             : 
    2631             : /* do some init work after IP late init, which it depends on.
    2632             :  * it runs in the resume, gpu reset and boot-up cases.
    2633             :  */
    2634           0 : void amdgpu_ras_resume(struct amdgpu_device *adev)
    2635             : {
    2636           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2637             :         struct ras_manager *obj, *tmp;
    2638             : 
    2639           0 :         if (!adev->ras_enabled || !con) {
    2640             :                 /* clean the ras context for VEGA20 Gaming after sending the ras disable cmd */
    2641           0 :                 amdgpu_release_ras_context(adev);
    2642             : 
    2643           0 :                 return;
    2644             :         }
    2645             : 
    2646           0 :         if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
    2647             :                 /* Set up all other IPs which are not implemented. One
    2648             :                  * subtlety: an IP's actual ras error type should be
    2649             :                  * MULTI_UNCORRECTABLE, but since the driver does not
    2650             :                  * handle it, ERROR_NONE makes sense anyway.
    2651             :                  */
    2652           0 :                 amdgpu_ras_enable_all_features(adev, 1);
    2653             : 
    2654             :                 /* We enable ras on all hw_supported blocks, but the boot
    2655             :                  * parameter might disable some of them, and one or more IPs
    2656             :                  * may not be implemented yet; disable those on their behalf.
    2657             :                  */
    2658           0 :                 list_for_each_entry_safe(obj, tmp, &con->head, node) {
    2659           0 :                         if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
    2660           0 :                                 amdgpu_ras_feature_enable(adev, &obj->head, 0);
    2661             :                                 /* there should not be any references. */
    2662           0 :                                 WARN_ON(alive_obj(obj));
    2663             :                         }
    2664             :                 }
    2665             :         }
    2666             : }
    2667             : 
    2668           0 : void amdgpu_ras_suspend(struct amdgpu_device *adev)
    2669             : {
    2670           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2671             : 
    2672           0 :         if (!adev->ras_enabled || !con)
    2673             :                 return;
    2674             : 
    2675           0 :         amdgpu_ras_disable_all_features(adev, 0);
    2676             :         /* Make sure all ras objects are disabled. */
    2677           0 :         if (con->features)
    2678           0 :                 amdgpu_ras_disable_all_features(adev, 1);
    2679             : }
    2680             : 
    2681           0 : int amdgpu_ras_late_init(struct amdgpu_device *adev)
    2682             : {
    2683             :         struct amdgpu_ras_block_list *node, *tmp;
    2684             :         struct amdgpu_ras_block_object *obj;
    2685             :         int r;
    2686             : 
    2687             :         /* The guest side doesn't need to init ras features */
    2688           0 :         if (amdgpu_sriov_vf(adev))
    2689             :                 return 0;
    2690             : 
    2691           0 :         list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
    2692           0 :                 if (!node->ras_obj) {
    2693           0 :                         dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
    2694           0 :                         continue;
    2695             :                 }
    2696             : 
    2697           0 :                 obj = node->ras_obj;
    2698           0 :                 if (obj->ras_late_init) {
    2699           0 :                         r = obj->ras_late_init(adev, &obj->ras_comm);
    2700           0 :                         if (r) {
    2701           0 :                                 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
    2702             :                                         obj->ras_comm.name, r);
    2703           0 :                                 return r;
    2704             :                         }
    2705             :                 } else
    2706           0 :                         amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
    2707             :         }
    2708             : 
    2709             :         return 0;
    2710             : }
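
A hedged sketch of how a hypothetical IP block plugs into this dispatch: it
registers an amdgpu_ras_block_object and, by leaving ras_late_init NULL,
gets the default late-init path above (names are placeholders, not driver
code):

        static struct amdgpu_ras_block_object my_ip_ras = {
                .ras_comm = {
                        .name  = "my_ip",               /* hypothetical */
                        .block = AMDGPU_RAS_BLOCK__GFX, /* placeholder  */
                },
                /* .ras_late_init left NULL: amdgpu_ras_late_init() falls
                 * back to amdgpu_ras_block_late_init_default() */
        };

        /* registered during early init, as done for NBIO in
         * amdgpu_ras_init() */
        amdgpu_ras_register_ras_block(adev, &my_ip_ras);
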
    2711             : 
    2712             : /* do some fini work before IP fini, which depends on it */
    2713           0 : int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
    2714             : {
    2715           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2716             : 
    2717           0 :         if (!adev->ras_enabled || !con)
    2718             :                 return 0;
    2719             : 
    2720             : 
    2721             :         /* Need to disable ras on all IPs here before ip [hw/sw] fini */
    2722           0 :         amdgpu_ras_disable_all_features(adev, 0);
    2723           0 :         amdgpu_ras_recovery_fini(adev);
    2724           0 :         return 0;
    2725             : }
    2726             : 
    2727           0 : int amdgpu_ras_fini(struct amdgpu_device *adev)
    2728             : {
    2729             :         struct amdgpu_ras_block_list *ras_node, *tmp;
    2730           0 :         struct amdgpu_ras_block_object *obj = NULL;
    2731           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2732             : 
    2733           0 :         if (!adev->ras_enabled || !con)
    2734             :                 return 0;
    2735             : 
    2736           0 :         list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
    2737           0 :                 if (ras_node->ras_obj) {
    2738           0 :                         obj = ras_node->ras_obj;
    2739           0 :                         if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
    2740           0 :                             obj->ras_fini)
    2741           0 :                                 obj->ras_fini(adev, &obj->ras_comm);
    2742             :                         else
    2743           0 :                                 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
    2744             :                 }
    2745             : 
    2746             :                 /* Clear ras blocks from ras_list and free ras block list node */
    2747           0 :                 list_del(&ras_node->node);
    2748           0 :                 kfree(ras_node);
    2749             :         }
    2750             : 
    2751           0 :         amdgpu_ras_fs_fini(adev);
    2752           0 :         amdgpu_ras_interrupt_remove_all(adev);
    2753             : 
    2754           0 :         WARN(con->features, "Feature mask is not cleared");
    2755             : 
    2756           0 :         if (con->features)
    2757           0 :                 amdgpu_ras_disable_all_features(adev, 1);
    2758             : 
    2759           0 :         cancel_delayed_work_sync(&con->ras_counte_delay_work);
    2760             : 
    2761           0 :         amdgpu_ras_set_context(adev, NULL);
    2762           0 :         kfree(con);
    2763             : 
    2764           0 :         return 0;
    2765             : }
    2766             : 
    2767           0 : void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
    2768             : {
    2769           0 :         amdgpu_ras_check_supported(adev);
    2770           0 :         if (!adev->ras_hw_enabled)
    2771             :                 return;
    2772             : 
    2773           0 :         if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
    2774           0 :                 dev_info(adev->dev, "uncorrectable hardware error "
    2775             :                         "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
    2776             : 
    2777           0 :                 amdgpu_ras_reset_gpu(adev);
    2778             :         }
    2779             : }
    2780             : 
    2781           0 : bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
    2782             : {
    2783           0 :         if (adev->asic_type == CHIP_VEGA20 &&
    2784           0 :             adev->pm.fw_version <= 0x283400) {
    2785           0 :                 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
    2786             :                                 amdgpu_ras_intr_triggered();
    2787             :         }
    2788             : 
    2789             :         return false;
    2790             : }
    2791             : 
    2792           0 : void amdgpu_release_ras_context(struct amdgpu_device *adev)
    2793             : {
    2794           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2795             : 
    2796           0 :         if (!con)
    2797             :                 return;
    2798             : 
    2799           0 :         if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
    2800           0 :                 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
    2801           0 :                 amdgpu_ras_set_context(adev, NULL);
    2802           0 :                 kfree(con);
    2803             :         }
    2804             : }
    2805             : 
    2806             : #ifdef CONFIG_X86_MCE_AMD
    2807             : static struct amdgpu_device *find_adev(uint32_t node_id)
    2808             : {
    2809             :         int i;
    2810             :         struct amdgpu_device *adev = NULL;
    2811             : 
    2812             :         for (i = 0; i < mce_adev_list.num_gpu; i++) {
    2813             :                 adev = mce_adev_list.devs[i];
    2814             : 
    2815             :                 if (adev && adev->gmc.xgmi.connected_to_cpu &&
    2816             :                     adev->gmc.xgmi.physical_node_id == node_id)
    2817             :                         break;
    2818             :                 adev = NULL;
    2819             :         }
    2820             : 
    2821             :         return adev;
    2822             : }
    2823             : 
    2824             : #define GET_MCA_IPID_GPUID(m)   (((m) >> 44) & 0xF)
    2825             : #define GET_UMC_INST(m)         (((m) >> 21) & 0x7)
    2826             : #define GET_CHAN_INDEX(m)       ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
    2827             : #define GPU_ID_OFFSET           8
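
These extractors can be exercised standalone. A runnable sketch with a
synthetic MCA_IPID value (the field layout simply mirrors the macros above):

        #include <stdint.h>
        #include <stdio.h>

        #define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF)
        #define GET_UMC_INST(m)       (((m) >> 21) & 0x7)
        #define GET_CHAN_INDEX(m)     ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
        #define GPU_ID_OFFSET         8

        int main(void)
        {
                /* synthetic IPID: gpu slot 9, umc inst 3, chan bits 2|4 */
                uint64_t ipid = (9ULL << 44) | (3ULL << 21) |
                                (1ULL << 20) | (2ULL << 12);

                /* prints: gpu_id=1 umc=3 chan=6 */
                printf("gpu_id=%llu umc=%llu chan=%llu\n",
                       (unsigned long long)(GET_MCA_IPID_GPUID(ipid) - GPU_ID_OFFSET),
                       (unsigned long long)GET_UMC_INST(ipid),
                       (unsigned long long)GET_CHAN_INDEX(ipid));
                return 0;
        }
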
    2828             : 
    2829             : static int amdgpu_bad_page_notifier(struct notifier_block *nb,
    2830             :                                     unsigned long val, void *data)
    2831             : {
    2832             :         struct mce *m = (struct mce *)data;
    2833             :         struct amdgpu_device *adev = NULL;
    2834             :         uint32_t gpu_id = 0;
    2835             :         uint32_t umc_inst = 0;
    2836             :         uint32_t ch_inst, channel_index = 0;
    2837             :         struct ras_err_data err_data = {0, 0, 0, NULL};
    2838             :         struct eeprom_table_record err_rec;
    2839             :         uint64_t retired_page;
    2840             : 
    2841             :         /*
    2842             :          * If the error was generated in UMC_V2, which belongs to GPU UMCs,
    2843             :          * and the error occurred in DramECC (Extended error code = 0),
    2844             :          * only then process the error; otherwise bail out.
    2845             :          */
    2846             :         if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
    2847             :                     (XEC(m->status, 0x3f) == 0x0)))
    2848             :                 return NOTIFY_DONE;
    2849             : 
    2850             :         /*
    2851             :          * If it is a correctable error, return.
    2852             :          */
    2853             :         if (mce_is_correctable(m))
    2854             :                 return NOTIFY_OK;
    2855             : 
    2856             :         /*
    2857             :          * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
    2858             :          */
    2859             :         gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
    2860             : 
    2861             :         adev = find_adev(gpu_id);
    2862             :         if (!adev) {
    2863             :                 DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
    2864             :                                                                 gpu_id);
    2865             :                 return NOTIFY_DONE;
    2866             :         }
    2867             : 
    2868             :         /*
    2869             :          * If it is an uncorrectable error, find out the UMC instance and
    2870             :          * channel index.
    2871             :          */
    2872             :         umc_inst = GET_UMC_INST(m->ipid);
    2873             :         ch_inst = GET_CHAN_INDEX(m->ipid);
    2874             : 
    2875             :         dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
    2876             :                              umc_inst, ch_inst);
    2877             : 
    2878             :         /*
    2879             :          * Translate the UMC channel address to a physical address
    2880             :          */
    2881             :         channel_index =
    2882             :                 adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
    2883             :                                           + ch_inst];
    2884             : 
    2885             :         retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
    2886             :                         ADDR_OF_256B_BLOCK(channel_index) |
    2887             :                         OFFSET_IN_256B_BLOCK(m->addr);
    2888             : 
    2889             :         memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
    2890             :         err_data.err_addr = &err_rec;
    2891             :         amdgpu_umc_fill_error_record(&err_data, m->addr,
    2892             :                         retired_page, channel_index, umc_inst);
    2893             : 
    2894             :         if (amdgpu_bad_page_threshold != 0) {
    2895             :                 amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
    2896             :                                                 err_data.err_addr_cnt);
    2897             :                 amdgpu_ras_save_bad_pages(adev);
    2898             :         }
    2899             : 
    2900             :         return NOTIFY_OK;
    2901             : }
    2902             : 
    2903             : static struct notifier_block amdgpu_bad_page_nb = {
    2904             :         .notifier_call  = amdgpu_bad_page_notifier,
    2905             :         .priority       = MCE_PRIO_UC,
    2906             : };
    2907             : 
    2908             : static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
    2909             : {
    2910             :         /*
    2911             :          * Add the adev to the mce_adev_list.
    2912             :          * During mode2 reset, amdgpu device is temporarily
    2913             :          * removed from the mgpu_info list which can cause
    2914             :          * page retirement to fail.
    2915             :          * Use this list instead of mgpu_info to find the amdgpu
    2916             :          * device on which the UMC error was reported.
    2917             :          */
    2918             :         mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
    2919             : 
    2920             :         /*
    2921             :          * Register the x86 notifier only once
    2922             :          * with MCE subsystem.
    2923             :          */
    2924             :         if (notifier_registered == false) {
    2925             :                 mce_register_decode_chain(&amdgpu_bad_page_nb);
    2926             :                 notifier_registered = true;
    2927             :         }
    2928             : }
    2929             : #endif
    2930             : 
    2931           0 : struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
    2932             : {
    2933           0 :         if (!adev)
    2934             :                 return NULL;
    2935             : 
    2936           0 :         return adev->psp.ras_context.ras;
    2937             : }
    2938             : 
    2939           0 : int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
    2940             : {
    2941           0 :         if (!adev)
    2942             :                 return -EINVAL;
    2943             : 
    2944           0 :         adev->psp.ras_context.ras = ras_con;
    2945           0 :         return 0;
    2946             : }
    2947             : 
    2948             : /* check if ras is supported on a block, e.g. sdma or gfx */
    2949           0 : int amdgpu_ras_is_supported(struct amdgpu_device *adev,
    2950             :                 unsigned int block)
    2951             : {
    2952           0 :         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
    2953             : 
    2954           0 :         if (block >= AMDGPU_RAS_BLOCK_COUNT)
    2955             :                 return 0;
    2956           0 :         return ras && (adev->ras_enabled & (1 << block));
    2957             : }
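
Callers throughout this file (e.g. amdgpu_ras_block_late_init() and
amdgpu_ras_fini()) use this as a gate; a minimal fragment of that pattern:

        if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
                /* safe to create sysfs nodes, interrupt handlers, ... */
        }
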
    2958             : 
    2959           0 : int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
    2960             : {
    2961           0 :         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
    2962             : 
    2963           0 :         if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
    2964           0 :                 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
    2965           0 :         return 0;
    2966             : }
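
The atomic_cmpxchg() makes recovery single-flight: the first caller
schedules the work, later callers no-op until amdgpu_ras_do_recovery()
clears in_recovery at its end. A runnable C11 sketch of that gate:

        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_int in_recovery;

        /* returns nonzero if this caller won the right to schedule */
        static int try_schedule(void)
        {
                int expected = 0;

                return atomic_compare_exchange_strong(&in_recovery,
                                                      &expected, 1);
        }

        int main(void)
        {
                printf("first:  %s\n", try_schedule() ? "scheduled" : "skipped");
                printf("second: %s\n", try_schedule() ? "scheduled" : "skipped");
                atomic_store(&in_recovery, 0);   /* worker finished */
                printf("third:  %s\n", try_schedule() ? "scheduled" : "skipped");
                return 0;
        }
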
    2967             : 
    2968             : 
    2969             : /* Register each ip ras block into amdgpu ras */
    2970           0 : int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
    2971             :                 struct amdgpu_ras_block_object *ras_block_obj)
    2972             : {
    2973             :         struct amdgpu_ras_block_list *ras_node;
    2974           0 :         if (!adev || !ras_block_obj)
    2975             :                 return -EINVAL;
    2976             : 
    2977           0 :         if (!amdgpu_ras_asic_supported(adev))
    2978             :                 return 0;
    2979             : 
    2980           0 :         ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
    2981           0 :         if (!ras_node)
    2982             :                 return -ENOMEM;
    2983             : 
    2984           0 :         INIT_LIST_HEAD(&ras_node->node);
    2985           0 :         ras_node->ras_obj = ras_block_obj;
    2986           0 :         list_add_tail(&ras_node->node, &adev->ras_list);
    2987             : 
    2988           0 :         return 0;
    2989             : }
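
Registration happens during early init; the NBIO hookup earlier in this
file is the in-tree example of the pattern:

        adev->nbio.ras = &nbio_v7_4_ras;
        amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
        adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
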

Generated by: LCOV version 1.14