LCOV - code coverage report
Current view: top level - drivers/gpu/drm/amd/amdgpu - amdgpu_xgmi.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 322 0.0 %
Date: 2022-12-09 01:23:36 Functions: 0 21 0.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright 2018 Advanced Micro Devices, Inc.
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the "Software"),
       6             :  * to deal in the Software without restriction, including without limitation
       7             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
       8             :  * and/or sell copies of the Software, and to permit persons to whom the
       9             :  * Software is furnished to do so, subject to the following conditions:
      10             :  *
      11             :  * The above copyright notice and this permission notice shall be included in
      12             :  * all copies or substantial portions of the Software.
      13             :  *
      14             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      15             :  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      16             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
      17             :  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
      18             :  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
      19             :  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
      20             :  * OTHER DEALINGS IN THE SOFTWARE.
      21             :  *
      22             :  *
      23             :  */
      24             : #include <linux/list.h>
      25             : #include "amdgpu.h"
      26             : #include "amdgpu_xgmi.h"
      27             : #include "amdgpu_ras.h"
      28             : #include "soc15.h"
      29             : #include "df/df_3_6_offset.h"
      30             : #include "xgmi/xgmi_4_0_0_smn.h"
      31             : #include "xgmi/xgmi_4_0_0_sh_mask.h"
      32             : #include "wafl/wafl2_4_0_0_smn.h"
      33             : #include "wafl/wafl2_4_0_0_sh_mask.h"
      34             : 
      35             : #include "amdgpu_reset.h"
      36             : 
      37             : #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
                       /*
                        * The two addresses defined here are the base entries of
                        * xgmi3x16_pcs_err_status_reg_aldebaran[] and
                        * walf_pcs_err_status_reg_aldebaran[] respectively.
                        */
      38             : #define smnPCS_GOPX1_PCS_ERROR_STATUS    0x12200210
      39             : 
                       /* Held by amdgpu_get_xgmi_hive() while it searches or extends xgmi_hive_list. */
      40             : static DEFINE_MUTEX(xgmi_mutex);
      41             : 
                       /* Initial amdgpu_hive_info.hi_req_count given to a newly created hive. */
      42             : #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE         4
      43             : 
                       /* Every hive created so far, linked through amdgpu_hive_info.node. */
      44             : static LIST_HEAD(xgmi_hive_list);
      45             : 
      46             : static const int xgmi_pcs_err_status_reg_vg20[] = {
                               /* Two XGMI PCS error-status register addresses, 0x100000 apart. */
      47             :         smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
      48             :         smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
      49             : };
      50             : 
      51             : static const int wafl_pcs_err_status_reg_vg20[] = {
                               /* Two WAFL PCS error-status register addresses, 0x100000 apart. */
      52             :         smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
      53             :         smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
      54             : };
      55             : 
      56             : static const int xgmi_pcs_err_status_reg_arct[] = {
                               /*
                                * Six XGMI PCS error-status register addresses. The offsets are
                                * not contiguous: 0x0 and 0x100000, then 0x500000 to 0x800000.
                                */
      57             :         smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
      58             :         smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
      59             :         smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
      60             :         smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
      61             :         smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
      62             :         smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
      63             : };
      64             : 
      65             : /* Same two addresses as wafl_pcs_err_status_reg_vg20[]. */
      66             : static const int wafl_pcs_err_status_reg_arct[] = {
      67             :         smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
      68             :         smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
      69             : };
      70             : 
      71             : static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
                               /* Eight consecutive addresses: base + n * 0x100000 for n = 0..7. */
      72             :         smnPCS_XGMI3X16_PCS_ERROR_STATUS,
      73             :         smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
      74             :         smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
      75             :         smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
      76             :         smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
      77             :         smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
      78             :         smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
      79             :         smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
      80             : };
      81             : 
      82             : static const int walf_pcs_err_status_reg_aldebaran[] = {
                               /*
                                * Two addresses, 0x100000 apart.
                                * NOTE(review): "walf" looks like a misspelling of "wafl"; the
                                * identifier is left alone because its users are outside this view.
                                */
      83             :         smnPCS_GOPX1_PCS_ERROR_STATUS,
      84             :         smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
      85             : };
      86             : 
      87             : static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
                               /*
                                * One entry per error field of XGMI0_PCS_GOPX16_PCS_ERROR_STATUS:
                                * a printable name followed by the SOC15_REG_FIELD() expansion
                                * for that field.
                                */
      88             :         {"XGMI PCS DataLossErr",
      89             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
      90             :         {"XGMI PCS TrainingErr",
      91             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
      92             :         {"XGMI PCS CRCErr",
      93             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
      94             :         {"XGMI PCS BERExceededErr",
      95             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
      96             :         {"XGMI PCS TxMetaDataErr",
      97             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
      98             :         {"XGMI PCS ReplayBufParityErr",
      99             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
     100             :         {"XGMI PCS DataParityErr",
     101             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
     102             :         {"XGMI PCS ReplayFifoOverflowErr",
     103             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
     104             :         {"XGMI PCS ReplayFifoUnderflowErr",
     105             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
     106             :         {"XGMI PCS ElasticFifoOverflowErr",
     107             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
     108             :         {"XGMI PCS DeskewErr",
     109             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
     110             :         {"XGMI PCS DataStartupLimitErr",
     111             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
     112             :         {"XGMI PCS FCInitTimeoutErr",
     113             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
     114             :         {"XGMI PCS RecoveryTimeoutErr",
     115             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
     116             :         {"XGMI PCS ReadySerialTimeoutErr",
     117             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
     118             :         {"XGMI PCS ReadySerialAttemptErr",
     119             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
     120             :         {"XGMI PCS RecoveryAttemptErr",
     121             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
     122             :         {"XGMI PCS RecoveryRelockAttemptErr",
     123             :          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
     124             : };
     125             : 
     126             : static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
                               /*
                                * One entry per error field of PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS:
                                * a printable name followed by the SOC15_REG_FIELD() expansion
                                * for that field. The field names mirror xgmi_pcs_ras_fields[].
                                */
     127             :         {"WAFL PCS DataLossErr",
     128             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
     129             :         {"WAFL PCS TrainingErr",
     130             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
     131             :         {"WAFL PCS CRCErr",
     132             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
     133             :         {"WAFL PCS BERExceededErr",
     134             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
     135             :         {"WAFL PCS TxMetaDataErr",
     136             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
     137             :         {"WAFL PCS ReplayBufParityErr",
     138             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
     139             :         {"WAFL PCS DataParityErr",
     140             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
     141             :         {"WAFL PCS ReplayFifoOverflowErr",
     142             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
     143             :         {"WAFL PCS ReplayFifoUnderflowErr",
     144             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
     145             :         {"WAFL PCS ElasticFifoOverflowErr",
     146             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
     147             :         {"WAFL PCS DeskewErr",
     148             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
     149             :         {"WAFL PCS DataStartupLimitErr",
     150             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
     151             :         {"WAFL PCS FCInitTimeoutErr",
     152             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
     153             :         {"WAFL PCS RecoveryTimeoutErr",
     154             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
     155             :         {"WAFL PCS ReadySerialTimeoutErr",
     156             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
     157             :         {"WAFL PCS ReadySerialAttemptErr",
     158             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
     159             :         {"WAFL PCS RecoveryAttemptErr",
     160             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
     161             :         {"WAFL PCS RecoveryRelockAttemptErr",
     162             :          SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
     163             : };
     164             : 
     165             : /**
     166             :  * DOC: AMDGPU XGMI Support
     167             :  *
     168             :  * XGMI is a high speed interconnect that joins multiple GPU cards
     169             :  * into a homogeneous memory space that is organized by a collective
     170             :  * hive ID and individual node IDs, both of which are 64-bit numbers.
     171             :  *
     172             :  * The file xgmi_device_id contains the unique per GPU device ID and
     173             :  * is stored in the /sys/class/drm/card${cardno}/device/ directory.
     174             :  *
     175             :  * Inside the device directory a sub-directory 'xgmi_hive_info' is
     176             :  * created which contains the hive ID and the list of nodes.
     177             :  *
     178             :  * The hive ID is stored in:
     179             :  *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
     180             :  *
     181             :  * The node information is stored in numbered directories:
     182             :  *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
     183             :  *
     184             :  * Each device has its own xgmi_hive_info directory with a mirrored
     185             :  * set of node sub-directories.
     186             :  *
     187             :  * The XGMI memory space is built by contiguously adding the power of
     188             :  * two padded VRAM space from each node to each other.
     189             :  *
     190             :  */
     191             : 
     192             : static struct attribute amdgpu_xgmi_hive_id = {
                               /* Read-only file; amdgpu_xgmi_show_attrs() produces its contents. */
     193             :         .name = "xgmi_hive_id",
     194             :         .mode = S_IRUGO
     195             : };
     196             : 
                       /*
                        * NULL-terminated attribute list. ATTRIBUTE_GROUPS() turns it into
                        * amdgpu_xgmi_hive_groups, which amdgpu_xgmi_hive_type installs as
                        * its default groups.
                        */
     197             : static struct attribute *amdgpu_xgmi_hive_attrs[] = {
     198             :         &amdgpu_xgmi_hive_id,
     199             :         NULL
     200             : };
     201             : ATTRIBUTE_GROUPS(amdgpu_xgmi_hive);
     202             : 
     203           0 : static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
     204             :         struct attribute *attr, char *buf)
     205             : {
     206           0 :         struct amdgpu_hive_info *hive = container_of(
     207             :                 kobj, struct amdgpu_hive_info, kobj);
     208             : 
     209           0 :         if (attr == &amdgpu_xgmi_hive_id)
     210           0 :                 return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
     211             : 
     212             :         return 0;
     213             : }
     214             : 
     215           0 : static void amdgpu_xgmi_hive_release(struct kobject *kobj)
     216             : {
     217           0 :         struct amdgpu_hive_info *hive = container_of(
     218             :                 kobj, struct amdgpu_hive_info, kobj);
     219             : 
     220           0 :         amdgpu_reset_put_reset_domain(hive->reset_domain);
     221           0 :         hive->reset_domain = NULL;
     222             : 
     223           0 :         mutex_destroy(&hive->hive_lock);
     224           0 :         kfree(hive);
     225           0 : }
     226             : 
     227             : static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
                               /* No .store callback is provided, so hive attributes are read-only. */
     228             :         .show = amdgpu_xgmi_show_attrs,
     229             : };
     230             : 
                       /*
                        * kobject type of every hive. amdgpu_get_xgmi_hive() passes it to
                        * kobject_init_and_add(), and .release frees the hive when the last
                        * reference is dropped.
                        */
     231             : struct kobj_type amdgpu_xgmi_hive_type = {
     232             :         .release = amdgpu_xgmi_hive_release,
     233             :         .sysfs_ops = &amdgpu_xgmi_hive_ops,
     234             :         .default_groups = amdgpu_xgmi_hive_groups,
     235             : };
     236             : 
     237           0 : static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
     238             :                                      struct device_attribute *attr,
     239             :                                      char *buf)
     240             : {
     241           0 :         struct drm_device *ddev = dev_get_drvdata(dev);
     242           0 :         struct amdgpu_device *adev = drm_to_adev(ddev);
     243             : 
     244           0 :         return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
     245             : 
     246             : }
     247             : 
     248             : #define AMDGPU_XGMI_SET_FICAA(o)        ((o) | 0x456801)
     249           0 : static ssize_t amdgpu_xgmi_show_error(struct device *dev,
     250             :                                       struct device_attribute *attr,
     251             :                                       char *buf)
     252             : {
     253           0 :         struct drm_device *ddev = dev_get_drvdata(dev);
     254           0 :         struct amdgpu_device *adev = drm_to_adev(ddev);
     255             :         uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
     256             :         uint64_t fica_out;
     257           0 :         unsigned int error_count = 0;
     258             : 
     259           0 :         ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
     260           0 :         ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
     261             : 
     262           0 :         if ((!adev->df.funcs) ||
     263           0 :             (!adev->df.funcs->get_fica) ||
     264           0 :             (!adev->df.funcs->set_fica))
     265             :                 return -EINVAL;
     266             : 
     267           0 :         fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
     268           0 :         if (fica_out != 0x1f)
     269           0 :                 pr_err("xGMI error counters not enabled!\n");
     270             : 
     271           0 :         fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);
     272             : 
     273           0 :         if ((fica_out & 0xffff) == 2)
     274           0 :                 error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
     275             : 
     276           0 :         adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
     277             : 
     278           0 :         return sysfs_emit(buf, "%u\n", error_count);
     279             : }
     280             : 
     281             : 
     282             : static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
                       /*
                        * Both attributes are read-only (S_IRUGO, no store callback). The
                        * resulting dev_attr_xgmi_device_id and dev_attr_xgmi_error are
                        * created in amdgpu_xgmi_sysfs_add_dev_info() and removed in
                        * amdgpu_xgmi_sysfs_rem_dev_info().
                        */
     283             : static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
     284             : 
     285           0 : static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
     286             :                                          struct amdgpu_hive_info *hive)
     287             : {
     288           0 :         int ret = 0;
     289           0 :         char node[10] = { 0 };
     290             : 
     291             :         /* Create xgmi device id file */
     292           0 :         ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
     293           0 :         if (ret) {
     294           0 :                 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
     295           0 :                 return ret;
     296             :         }
     297             : 
     298             :         /* Create xgmi error file */
     299           0 :         ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
     300           0 :         if (ret)
     301           0 :                 pr_err("failed to create xgmi_error\n");
     302             : 
     303             : 
     304             :         /* Create sysfs link to hive info folder on the first device */
     305           0 :         if (hive->kobj.parent != (&adev->dev->kobj)) {
     306           0 :                 ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
     307             :                                         "xgmi_hive_info");
     308           0 :                 if (ret) {
     309           0 :                         dev_err(adev->dev, "XGMI: Failed to create link to hive info");
     310           0 :                         goto remove_file;
     311             :                 }
     312             :         }
     313             : 
     314           0 :         sprintf(node, "node%d", atomic_read(&hive->number_devices));
     315             :         /* Create sysfs link form the hive folder to yourself */
     316           0 :         ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
     317           0 :         if (ret) {
     318           0 :                 dev_err(adev->dev, "XGMI: Failed to create link from hive info");
     319             :                 goto remove_link;
     320             :         }
     321             : 
     322             :         goto success;
     323             : 
     324             : 
     325             : remove_link:
     326           0 :         sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);
     327             : 
     328             : remove_file:
     329           0 :         device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
     330             : 
     331             : success:
     332             :         return ret;
     333             : }
     334             : 
     335           0 : static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
     336             :                                           struct amdgpu_hive_info *hive)
     337             : {
     338             :         char node[10];
     339           0 :         memset(node, 0, sizeof(node));
     340             : 
     341           0 :         device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
     342           0 :         device_remove_file(adev->dev, &dev_attr_xgmi_error);
     343             : 
     344           0 :         if (hive->kobj.parent != (&adev->dev->kobj))
     345           0 :                 sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info");
     346             : 
     347           0 :         sprintf(node, "node%d", atomic_read(&hive->number_devices));
     348           0 :         sysfs_remove_link(&hive->kobj, node);
     349             : 
     350           0 : }
     351             : 
     352             : 
     353             : 
     354           0 : struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
     355             : {
     356           0 :         struct amdgpu_hive_info *hive = NULL;
     357             :         int ret;
     358             : 
     359           0 :         if (!adev->gmc.xgmi.hive_id)
     360             :                 return NULL;
     361             : 
     362           0 :         if (adev->hive) {
     363           0 :                 kobject_get(&adev->hive->kobj);
     364           0 :                 return adev->hive;
     365             :         }
     366             : 
     367           0 :         mutex_lock(&xgmi_mutex);
     368             : 
     369           0 :         list_for_each_entry(hive, &xgmi_hive_list, node)  {
     370           0 :                 if (hive->hive_id == adev->gmc.xgmi.hive_id)
     371             :                         goto pro_end;
     372             :         }
     373             : 
     374           0 :         hive = kzalloc(sizeof(*hive), GFP_KERNEL);
     375           0 :         if (!hive) {
     376           0 :                 dev_err(adev->dev, "XGMI: allocation failed\n");
     377           0 :                 hive = NULL;
     378           0 :                 goto pro_end;
     379             :         }
     380             : 
     381             :         /* initialize new hive if not exist */
     382           0 :         ret = kobject_init_and_add(&hive->kobj,
     383             :                         &amdgpu_xgmi_hive_type,
     384           0 :                         &adev->dev->kobj,
     385             :                         "%s", "xgmi_hive_info");
     386           0 :         if (ret) {
     387           0 :                 dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
     388           0 :                 kobject_put(&hive->kobj);
     389           0 :                 kfree(hive);
     390           0 :                 hive = NULL;
     391           0 :                 goto pro_end;
     392             :         }
     393             : 
     394             :         /**
     395             :          * Only init hive->reset_domain for none SRIOV configuration. For SRIOV,
     396             :          * Host driver decide how to reset the GPU either through FLR or chain reset.
     397             :          * Guest side will get individual notifications from the host for the FLR
     398             :          * if necessary.
     399             :          */
     400           0 :         if (!amdgpu_sriov_vf(adev)) {
     401             :         /**
     402             :          * Avoid recreating reset domain when hive is reconstructed for the case
     403             :          * of reset the devices in the XGMI hive during probe for passthrough GPU
     404             :          * See https://www.spinics.net/lists/amd-gfx/msg58836.html
     405             :          */
     406           0 :                 if (adev->reset_domain->type != XGMI_HIVE) {
     407           0 :                         hive->reset_domain =
     408           0 :                                 amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
     409           0 :                                 if (!hive->reset_domain) {
     410           0 :                                         dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
     411           0 :                                         ret = -ENOMEM;
     412           0 :                                         kobject_put(&hive->kobj);
     413           0 :                                         kfree(hive);
     414           0 :                                         hive = NULL;
     415           0 :                                         goto pro_end;
     416             :                                 }
     417             :                 } else {
     418           0 :                         amdgpu_reset_get_reset_domain(adev->reset_domain);
     419           0 :                         hive->reset_domain = adev->reset_domain;
     420             :                 }
     421             :         }
     422             : 
     423           0 :         hive->hive_id = adev->gmc.xgmi.hive_id;
     424           0 :         INIT_LIST_HEAD(&hive->device_list);
     425           0 :         INIT_LIST_HEAD(&hive->node);
     426           0 :         mutex_init(&hive->hive_lock);
     427           0 :         atomic_set(&hive->number_devices, 0);
     428           0 :         task_barrier_init(&hive->tb);
     429           0 :         hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
     430           0 :         hive->hi_req_gpu = NULL;
     431             : 
     432             :         /*
     433             :          * hive pstate on boot is high in vega20 so we have to go to low
     434             :          * pstate on after boot.
     435             :          */
     436           0 :         hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
     437           0 :         list_add_tail(&hive->node, &xgmi_hive_list);
     438             : 
     439             : pro_end:
     440           0 :         if (hive)
     441           0 :                 kobject_get(&hive->kobj);
     442           0 :         mutex_unlock(&xgmi_mutex);
     443           0 :         return hive;
     444             : }
     445             : 
     446           0 : void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
     447             : {
     448           0 :         if (hive)
     449           0 :                 kobject_put(&hive->kobj);
     450           0 : }
     451             : 
     452           0 : int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
     453             : {
     454           0 :         int ret = 0;
     455             :         struct amdgpu_hive_info *hive;
     456             :         struct amdgpu_device *request_adev;
     457           0 :         bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
     458             :         bool init_low;
     459             : 
     460           0 :         hive = amdgpu_get_xgmi_hive(adev);
     461           0 :         if (!hive)
     462             :                 return 0;
     463             : 
     464           0 :         request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
     465           0 :         init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
     466           0 :         amdgpu_put_xgmi_hive(hive);
     467             :         /* fw bug so temporarily disable pstate switching */
     468           0 :         return 0;
     469             : 
     470             :         if (!hive || adev->asic_type != CHIP_VEGA20)
     471             :                 return 0;
     472             : 
     473             :         mutex_lock(&hive->hive_lock);
     474             : 
     475             :         if (is_hi_req)
     476             :                 hive->hi_req_count++;
     477             :         else
     478             :                 hive->hi_req_count--;
     479             : 
     480             :         /*
     481             :          * Vega20 only needs single peer to request pstate high for the hive to
     482             :          * go high but all peers must request pstate low for the hive to go low
     483             :          */
     484             :         if (hive->pstate == pstate ||
     485             :                         (!is_hi_req && hive->hi_req_count && !init_low))
     486             :                 goto out;
     487             : 
     488             :         dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);
     489             : 
     490             :         ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
     491             :         if (ret) {
     492             :                 dev_err(request_adev->dev,
     493             :                         "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
     494             :                         request_adev->gmc.xgmi.node_id,
     495             :                         request_adev->gmc.xgmi.hive_id, ret);
     496             :                 goto out;
     497             :         }
     498             : 
     499             :         if (init_low)
     500             :                 hive->pstate = hive->hi_req_count ?
     501             :                                         hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
     502             :         else {
     503             :                 hive->pstate = pstate;
     504             :                 hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
     505             :                                                         adev : NULL;
     506             :         }
     507             : out:
     508             :         mutex_unlock(&hive->hive_lock);
     509             :         return ret;
     510             : }
     511             : 
     512           0 : int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
     513             : {
     514             :         int ret;
     515             : 
     516           0 :         if (amdgpu_sriov_vf(adev))
     517             :                 return 0;
     518             : 
     519             :         /* Each psp need to set the latest topology */
     520           0 :         ret = psp_xgmi_set_topology_info(&adev->psp,
     521           0 :                                          atomic_read(&hive->number_devices),
     522             :                                          &adev->psp.xgmi_context.top_info);
     523           0 :         if (ret)
     524           0 :                 dev_err(adev->dev,
     525             :                         "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
     526             :                         adev->gmc.xgmi.node_id,
     527             :                         adev->gmc.xgmi.hive_id, ret);
     528             : 
     529             :         return ret;
     530             : }
     531             : 
     532             : 
     533             : /*
     534             :  * NOTE psp_xgmi_node_info.num_hops layout is as follows:
     535             :  * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
     536             :  * num_hops[5:3] = reserved
     537             :  * num_hops[2:0] = number of hops
     538             :  */
     539           0 : int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
     540             :                 struct amdgpu_device *peer_adev)
     541             : {
     542           0 :         struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
     543           0 :         uint8_t num_hops_mask = 0x7;
     544             :         int i;
     545             : 
     546           0 :         for (i = 0 ; i < top->num_nodes; ++i)
     547           0 :                 if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
     548           0 :                         return top->nodes[i].num_hops & num_hops_mask;
     549             :         return  -EINVAL;
     550             : }
     551             : 
     552           0 : int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
     553             :                 struct amdgpu_device *peer_adev)
     554             : {
     555           0 :         struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
     556             :         int i;
     557             : 
     558           0 :         for (i = 0 ; i < top->num_nodes; ++i)
     559           0 :                 if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
     560           0 :                         return top->nodes[i].num_links;
     561             :         return  -EINVAL;
     562             : }
     563             : 
     564             : /*
     565             :  * Devices that support extended data require the entire hive to initialize with
     566             :  * the shared memory buffer flag set.
     567             :  *
     568             :  * Hive locks and conditions apply - see amdgpu_xgmi_add_device
     569             :  */
     570           0 : static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
     571             :                                                         bool set_extended_data)
     572             : {
     573             :         struct amdgpu_device *tmp_adev;
     574             :         int ret;
     575             : 
     576           0 :         list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
     577           0 :                 ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
     578           0 :                 if (ret) {
     579           0 :                         dev_err(tmp_adev->dev,
     580             :                                 "XGMI: Failed to initialize xgmi session for data partition %i\n",
     581             :                                 set_extended_data);
     582           0 :                         return ret;
     583             :                 }
     584             : 
     585             :         }
     586             : 
     587             :         return 0;
     588             : }
     589             : 
     590           0 : int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
     591             : {
     592             :         struct psp_xgmi_topology_info *top_info;
     593             :         struct amdgpu_hive_info *hive;
     594             :         struct amdgpu_xgmi      *entry;
     595           0 :         struct amdgpu_device *tmp_adev = NULL;
     596             : 
     597           0 :         int count = 0, ret = 0;
     598             : 
     599           0 :         if (!adev->gmc.xgmi.supported)
     600             :                 return 0;
     601             : 
     602           0 :         if (!adev->gmc.xgmi.pending_reset &&
     603           0 :             amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
     604           0 :                 ret = psp_xgmi_initialize(&adev->psp, false, true);
     605           0 :                 if (ret) {
     606           0 :                         dev_err(adev->dev,
     607             :                                 "XGMI: Failed to initialize xgmi session\n");
     608           0 :                         return ret;
     609             :                 }
     610             : 
     611           0 :                 ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
     612           0 :                 if (ret) {
     613           0 :                         dev_err(adev->dev,
     614             :                                 "XGMI: Failed to get hive id\n");
     615           0 :                         return ret;
     616             :                 }
     617             : 
     618           0 :                 ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
     619           0 :                 if (ret) {
     620           0 :                         dev_err(adev->dev,
     621             :                                 "XGMI: Failed to get node id\n");
     622           0 :                         return ret;
     623             :                 }
     624             :         } else {
     625           0 :                 adev->gmc.xgmi.hive_id = 16;
     626           0 :                 adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
     627             :         }
     628             : 
     629           0 :         hive = amdgpu_get_xgmi_hive(adev);
     630           0 :         if (!hive) {
     631           0 :                 ret = -EINVAL;
     632           0 :                 dev_err(adev->dev,
     633             :                         "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
     634             :                         adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
     635           0 :                 goto exit;
     636             :         }
     637           0 :         mutex_lock(&hive->hive_lock);
     638             : 
     639           0 :         top_info = &adev->psp.xgmi_context.top_info;
     640             : 
     641           0 :         list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
     642           0 :         list_for_each_entry(entry, &hive->device_list, head)
     643           0 :                 top_info->nodes[count++].node_id = entry->node_id;
     644           0 :         top_info->num_nodes = count;
     645           0 :         atomic_set(&hive->number_devices, count);
     646             : 
     647           0 :         task_barrier_add_task(&hive->tb);
     648             : 
     649           0 :         if (!adev->gmc.xgmi.pending_reset &&
     650           0 :             amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
     651           0 :                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
     652             :                         /* update node list for other device in the hive */
     653           0 :                         if (tmp_adev != adev) {
     654           0 :                                 top_info = &tmp_adev->psp.xgmi_context.top_info;
     655           0 :                                 top_info->nodes[count - 1].node_id =
     656           0 :                                         adev->gmc.xgmi.node_id;
     657           0 :                                 top_info->num_nodes = count;
     658             :                         }
     659           0 :                         ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
     660           0 :                         if (ret)
     661             :                                 goto exit_unlock;
     662             :                 }
     663             : 
     664             :                 /* get latest topology info for each device from psp */
     665           0 :                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
     666           0 :                         ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
     667             :                                         &tmp_adev->psp.xgmi_context.top_info, false);
     668           0 :                         if (ret) {
     669           0 :                                 dev_err(tmp_adev->dev,
     670             :                                         "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
     671             :                                         tmp_adev->gmc.xgmi.node_id,
     672             :                                         tmp_adev->gmc.xgmi.hive_id, ret);
     673             :                                 /* To do : continue with some node failed or disable the whole hive */
     674           0 :                                 goto exit_unlock;
     675             :                         }
     676             :                 }
     677             : 
     678             :                 /* get topology again for hives that support extended data */
     679           0 :                 if (adev->psp.xgmi_context.supports_extended_data) {
     680             : 
     681             :                         /* initialize the hive to get extended data.  */
     682           0 :                         ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
     683           0 :                         if (ret)
     684             :                                 goto exit_unlock;
     685             : 
     686             :                         /* get the extended data. */
     687           0 :                         list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
     688           0 :                                 ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
     689             :                                                 &tmp_adev->psp.xgmi_context.top_info, true);
     690           0 :                                 if (ret) {
     691           0 :                                         dev_err(tmp_adev->dev,
     692             :                                                 "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
     693             :                                                 tmp_adev->gmc.xgmi.node_id,
     694             :                                                 tmp_adev->gmc.xgmi.hive_id, ret);
     695           0 :                                         goto exit_unlock;
     696             :                                 }
     697             :                         }
     698             : 
     699             :                         /* initialize the hive to get non-extended data for the next round. */
     700           0 :                         ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
     701           0 :                         if (ret)
     702             :                                 goto exit_unlock;
     703             : 
     704             :                 }
     705             :         }
     706             : 
     707           0 :         if (!ret && !adev->gmc.xgmi.pending_reset)
     708           0 :                 ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
     709             : 
     710             : exit_unlock:
     711           0 :         mutex_unlock(&hive->hive_lock);
     712             : exit:
     713           0 :         if (!ret) {
     714           0 :                 adev->hive = hive;
     715           0 :                 dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
     716             :                          adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
     717             :         } else {
     718           0 :                 amdgpu_put_xgmi_hive(hive);
     719           0 :                 dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
     720             :                         adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
     721             :                         ret);
     722             :         }
     723             : 
     724             :         return ret;
     725             : }
     726             : 
     727           0 : int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
     728             : {
     729           0 :         struct amdgpu_hive_info *hive = adev->hive;
     730             : 
     731           0 :         if (!adev->gmc.xgmi.supported)
     732             :                 return -EINVAL;
     733             : 
     734           0 :         if (!hive)
     735             :                 return -EINVAL;
     736             : 
     737           0 :         mutex_lock(&hive->hive_lock);
     738           0 :         task_barrier_rem_task(&hive->tb);
     739           0 :         amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
     740           0 :         if (hive->hi_req_gpu == adev)
     741           0 :                 hive->hi_req_gpu = NULL;
     742           0 :         list_del(&adev->gmc.xgmi.head);
     743           0 :         mutex_unlock(&hive->hive_lock);
     744             : 
     745           0 :         amdgpu_put_xgmi_hive(hive);
     746           0 :         adev->hive = NULL;
     747             : 
     748           0 :         if (atomic_dec_return(&hive->number_devices) == 0) {
     749             :                 /* Remove the hive from global hive list */
     750           0 :                 mutex_lock(&xgmi_mutex);
     751           0 :                 list_del(&hive->node);
     752           0 :                 mutex_unlock(&xgmi_mutex);
     753             : 
     754             :                 amdgpu_put_xgmi_hive(hive);
     755             :         }
     756             : 
     757             :         return 0;
     758             : }
     759             : 
     760           0 : static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
     761             : {
     762           0 :         if (!adev->gmc.xgmi.supported ||
     763           0 :             adev->gmc.xgmi.num_physical_nodes == 0)
     764             :                 return 0;
     765             : 
     766           0 :         adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
     767             : 
     768           0 :         return amdgpu_ras_block_late_init(adev, ras_block);
     769             : }
     770             : 
     771           0 : uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
     772             :                                            uint64_t addr)
     773             : {
     774           0 :         struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
     775           0 :         return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
     776             : }
     777             : 
     778             : static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
     779             : {
     780           0 :         WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
     781           0 :         WREG32_PCIE(pcs_status_reg, 0);
     782             : }
     783             : 
     784           0 : static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
     785             : {
     786             :         uint32_t i;
     787             : 
     788           0 :         switch (adev->asic_type) {
     789             :         case CHIP_ARCTURUS:
     790           0 :                 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
     791           0 :                         pcs_clear_status(adev,
     792           0 :                                          xgmi_pcs_err_status_reg_arct[i]);
     793             :                 break;
     794             :         case CHIP_VEGA20:
     795           0 :                 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
     796           0 :                         pcs_clear_status(adev,
     797           0 :                                          xgmi_pcs_err_status_reg_vg20[i]);
     798             :                 break;
     799             :         case CHIP_ALDEBARAN:
     800           0 :                 for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
     801           0 :                         pcs_clear_status(adev,
     802           0 :                                          xgmi3x16_pcs_err_status_reg_aldebaran[i]);
     803           0 :                 for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
     804           0 :                         pcs_clear_status(adev,
     805           0 :                                          walf_pcs_err_status_reg_aldebaran[i]);
     806             :                 break;
     807             :         default:
     808             :                 break;
     809             :         }
     810           0 : }
     811             : 
     812           0 : static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
     813             :                                               uint32_t value,
     814             :                                               uint32_t *ue_count,
     815             :                                               uint32_t *ce_count,
     816             :                                               bool is_xgmi_pcs)
     817             : {
     818             :         int i;
     819             :         int ue_cnt;
     820             : 
     821           0 :         if (is_xgmi_pcs) {
     822             :                 /* query xgmi pcs error status,
     823             :                  * only ue is supported */
     824           0 :                 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) {
     825           0 :                         ue_cnt = (value &
     826           0 :                                   xgmi_pcs_ras_fields[i].pcs_err_mask) >>
     827           0 :                                   xgmi_pcs_ras_fields[i].pcs_err_shift;
     828           0 :                         if (ue_cnt) {
     829           0 :                                 dev_info(adev->dev, "%s detected\n",
     830             :                                          xgmi_pcs_ras_fields[i].err_name);
     831           0 :                                 *ue_count += ue_cnt;
     832             :                         }
     833             :                 }
     834             :         } else {
     835             :                 /* query wafl pcs error status,
     836             :                  * only ue is supported */
     837           0 :                 for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
     838           0 :                         ue_cnt = (value &
     839           0 :                                   wafl_pcs_ras_fields[i].pcs_err_mask) >>
     840           0 :                                   wafl_pcs_ras_fields[i].pcs_err_shift;
     841           0 :                         if (ue_cnt) {
     842           0 :                                 dev_info(adev->dev, "%s detected\n",
     843             :                                          wafl_pcs_ras_fields[i].err_name);
     844           0 :                                 *ue_count += ue_cnt;
     845             :                         }
     846             :                 }
     847             :         }
     848             : 
     849           0 :         return 0;
     850             : }
     851             : 
     852           0 : static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
     853             :                                              void *ras_error_status)
     854             : {
     855           0 :         struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
     856             :         int i;
     857             :         uint32_t data;
     858           0 :         uint32_t ue_cnt = 0, ce_cnt = 0;
     859             : 
     860           0 :         if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
     861           0 :                 return ;
     862             : 
     863           0 :         err_data->ue_count = 0;
     864           0 :         err_data->ce_count = 0;
     865             : 
     866           0 :         switch (adev->asic_type) {
     867             :         case CHIP_ARCTURUS:
     868             :                 /* check xgmi pcs error */
     869           0 :                 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
     870           0 :                         data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
     871           0 :                         if (data)
     872           0 :                                 amdgpu_xgmi_query_pcs_error_status(adev,
     873             :                                                 data, &ue_cnt, &ce_cnt, true);
     874             :                 }
     875             :                 /* check wafl pcs error */
     876           0 :                 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
     877           0 :                         data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
     878           0 :                         if (data)
     879           0 :                                 amdgpu_xgmi_query_pcs_error_status(adev,
     880             :                                                 data, &ue_cnt, &ce_cnt, false);
     881             :                 }
     882             :                 break;
     883             :         case CHIP_VEGA20:
     884             :                 /* check xgmi pcs error */
     885           0 :                 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
     886           0 :                         data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
     887           0 :                         if (data)
     888           0 :                                 amdgpu_xgmi_query_pcs_error_status(adev,
     889             :                                                 data, &ue_cnt, &ce_cnt, true);
     890             :                 }
     891             :                 /* check wafl pcs error */
     892           0 :                 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
     893           0 :                         data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
     894           0 :                         if (data)
     895           0 :                                 amdgpu_xgmi_query_pcs_error_status(adev,
     896             :                                                 data, &ue_cnt, &ce_cnt, false);
     897             :                 }
     898             :                 break;
     899             :         case CHIP_ALDEBARAN:
     900             :                 /* check xgmi3x16 pcs error */
     901           0 :                 for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
     902           0 :                         data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
     903           0 :                         if (data)
     904           0 :                                 amdgpu_xgmi_query_pcs_error_status(adev,
     905             :                                                 data, &ue_cnt, &ce_cnt, true);
     906             :                 }
     907             :                 /* check wafl pcs error */
     908           0 :                 for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
     909           0 :                         data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
     910           0 :                         if (data)
     911           0 :                                 amdgpu_xgmi_query_pcs_error_status(adev,
     912             :                                                 data, &ue_cnt, &ce_cnt, false);
     913             :                 }
     914             :                 break;
     915             :         default:
     916           0 :                 dev_warn(adev->dev, "XGMI RAS error query not supported");
     917           0 :                 break;
     918             :         }
     919             : 
     920           0 :         adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
     921             : 
     922           0 :         err_data->ue_count += ue_cnt;
     923           0 :         err_data->ce_count += ce_cnt;
     924             : }
     925             : 
     926             : /* Trigger XGMI/WAFL error */
     927           0 : static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,  void *inject_if)
     928             : {
     929           0 :         int ret = 0;
     930           0 :         struct ta_ras_trigger_error_input *block_info =
     931             :                                 (struct ta_ras_trigger_error_input *)inject_if;
     932             : 
     933           0 :         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
     934           0 :                 dev_warn(adev->dev, "Failed to disallow df cstate");
     935             : 
     936           0 :         if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
     937           0 :                 dev_warn(adev->dev, "Failed to disallow XGMI power down");
     938             : 
     939           0 :         ret = psp_ras_trigger_error(&adev->psp, block_info);
     940             : 
     941           0 :         if (amdgpu_ras_intr_triggered())
     942             :                 return ret;
     943             : 
     944           0 :         if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
     945           0 :                 dev_warn(adev->dev, "Failed to allow XGMI power down");
     946             : 
     947           0 :         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
     948           0 :                 dev_warn(adev->dev, "Failed to allow df cstate");
     949             : 
     950             :         return ret;
     951             : }
     952             : 
     953             : struct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops = {
     954             :         .query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
     955             :         .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
     956             :         .ras_error_inject = amdgpu_ras_error_inject_xgmi,
     957             : };
     958             : 
     959             : struct amdgpu_xgmi_ras xgmi_ras = {
     960             :         .ras_block = {
     961             :                 .ras_comm = {
     962             :                         .name = "xgmi_wafl",
     963             :                         .block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
     964             :                         .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
     965             :                 },
     966             :                 .hw_ops = &xgmi_ras_hw_ops,
     967             :                 .ras_late_init = amdgpu_xgmi_ras_late_init,
     968             :         },
     969             : };

Generated by: LCOV version 1.14