LCOV - code coverage report
Current view: top level - drivers/gpu/drm/amd/amdgpu - amdgpu_ras_eeprom.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 453 0.0 %
Date: 2022-12-09 01:23:36 Functions: 0 22 0.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright 2019 Advanced Micro Devices, Inc.
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the "Software"),
       6             :  * to deal in the Software without restriction, including without limitation
       7             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
       8             :  * and/or sell copies of the Software, and to permit persons to whom the
       9             :  * Software is furnished to do so, subject to the following conditions:
      10             :  *
      11             :  * The above copyright notice and this permission notice shall be included in
      12             :  * all copies or substantial portions of the Software.
      13             :  *
      14             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      15             :  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      16             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
      17             :  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
      18             :  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
      19             :  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
      20             :  * OTHER DEALINGS IN THE SOFTWARE.
      21             :  *
      22             :  */
      23             : 
      24             : #include "amdgpu_ras_eeprom.h"
      25             : #include "amdgpu.h"
      26             : #include "amdgpu_ras.h"
      27             : #include <linux/bits.h>
      28             : #include "atom.h"
      29             : #include "amdgpu_eeprom.h"
      30             : #include "amdgpu_atomfirmware.h"
      31             : #include <linux/debugfs.h>
      32             : #include <linux/uaccess.h>
      33             : 
      34             : #include "amdgpu_reset.h"
      35             : 
      36             : #define EEPROM_I2C_MADDR_VEGA20         0x0
      37             : #define EEPROM_I2C_MADDR_ARCTURUS       0x40000
      38             : #define EEPROM_I2C_MADDR_ARCTURUS_D342  0x0
      39             : #define EEPROM_I2C_MADDR_SIENNA_CICHLID 0x0
      40             : #define EEPROM_I2C_MADDR_ALDEBARAN      0x0
      41             : 
      42             : /*
      43             :  * The 2 macros below represent the actual size in bytes that
      44             :  * those entities occupy in the EEPROM memory.
      45             :  * RAS_TABLE_RECORD_SIZE is different from sizeof(eeprom_table_record), which
      46             :  * uses uint64 to store 6-byte fields such as retired_page.
      47             :  */
      48             : #define RAS_TABLE_HEADER_SIZE   20
      49             : #define RAS_TABLE_RECORD_SIZE   24
      50             : 
      51             : /* Table hdr is 'AMDR' */
      52             : #define RAS_TABLE_HDR_VAL       0x414d4452
      53             : #define RAS_TABLE_VER           0x00010000
      54             : 
      55             : /* Bad GPU tag 'BADG' */
      56             : #define RAS_TABLE_HDR_BAD       0x42414447
      57             : 
      58             : /* Assume 2-Mbit size EEPROM and take up the whole space. */
      59             : #define RAS_TBL_SIZE_BYTES      (256 * 1024)
      60             : #define RAS_TABLE_START         0
      61             : #define RAS_HDR_START           RAS_TABLE_START
      62             : #define RAS_RECORD_START        (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
      63             : #define RAS_MAX_RECORD_COUNT    ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
      64             :                                  / RAS_TABLE_RECORD_SIZE)
      65             : 
      66             : /* Given a zero-based index of an EEPROM RAS record, yields the EEPROM
      67             :  * offset off of RAS_TABLE_START.  That is, this is something you can
      68             :  * add to control->i2c_address, and then tell I2C layer to read
      69             :  * from/write to there. _N is the so called absolute index,
      70             :  * because it starts right after the table header.
      71             :  */
      72             : #define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \
      73             :                                      (_N) * RAS_TABLE_RECORD_SIZE)
      74             : 
      75             : #define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
      76             :                                       (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)
      77             : 
      78             : /* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
      79             :  * of "fri", return the absolute record index off of the end of
      80             :  * the table header.
      81             :  */
      82             : #define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
      83             :                               (_C)->ras_max_record_count)
      84             : 
      85             : #define RAS_NUM_RECS(_tbl_hdr)  (((_tbl_hdr)->tbl_size - \
      86             :                                   RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)
      87             : 
      88             : #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
      89             : 
      90             : static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
      91             : {
      92           0 :         return  adev->asic_type == CHIP_VEGA20 ||
      93           0 :                 adev->asic_type == CHIP_ARCTURUS ||
      94           0 :                 adev->asic_type == CHIP_SIENNA_CICHLID ||
      95             :                 adev->asic_type == CHIP_ALDEBARAN;
      96             : }
      97             : 
      98           0 : static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
      99             :                                        struct amdgpu_ras_eeprom_control *control)
     100             : {
     101           0 :         struct atom_context *atom_ctx = adev->mode_info.atom_context;
     102             : 
     103           0 :         if (!control || !atom_ctx)
     104             :                 return false;
     105             : 
     106           0 :         if (strnstr(atom_ctx->vbios_version,
     107             :                     "D342",
     108             :                     sizeof(atom_ctx->vbios_version)))
     109           0 :                 control->i2c_address = EEPROM_I2C_MADDR_ARCTURUS_D342;
     110             :         else
     111           0 :                 control->i2c_address = EEPROM_I2C_MADDR_ARCTURUS;
     112             : 
     113             :         return true;
     114             : }
     115             : 
     116           0 : static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
     117             :                                   struct amdgpu_ras_eeprom_control *control)
     118             : {
     119             :         u8 i2c_addr;
     120             : 
     121           0 :         if (!control)
     122             :                 return false;
     123             : 
     124           0 :         if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
     125             :                 /* The address given by VBIOS is an 8-bit, wire-format
     126             :                  * address, i.e. the most significant byte.
     127             :                  *
     128             :                  * Normalize it to a 19-bit EEPROM address. Remove the
     129             :                  * device type identifier and make it a 7-bit address;
     130             :                  * then make it a 19-bit EEPROM address. See top of
     131             :                  * amdgpu_eeprom.c.
     132             :                  */
     133           0 :                 i2c_addr = (i2c_addr & 0x0F) >> 1;
     134           0 :                 control->i2c_address = ((u32) i2c_addr) << 16;
     135             : 
     136           0 :                 return true;
     137             :         }
     138             : 
     139           0 :         switch (adev->asic_type) {
     140             :         case CHIP_VEGA20:
     141           0 :                 control->i2c_address = EEPROM_I2C_MADDR_VEGA20;
     142           0 :                 break;
     143             : 
     144             :         case CHIP_ARCTURUS:
     145           0 :                 return __get_eeprom_i2c_addr_arct(adev, control);
     146             : 
     147             :         case CHIP_SIENNA_CICHLID:
     148           0 :                 control->i2c_address = EEPROM_I2C_MADDR_SIENNA_CICHLID;
     149           0 :                 break;
     150             : 
     151             :         case CHIP_ALDEBARAN:
     152           0 :                 control->i2c_address = EEPROM_I2C_MADDR_ALDEBARAN;
     153           0 :                 break;
     154             : 
     155             :         default:
     156             :                 return false;
     157             :         }
     158             : 
     159             :         return true;
     160             : }
     161             : 
     162             : static void
     163             : __encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr,
     164             :                              unsigned char *buf)
     165             : {
     166           0 :         u32 *pp = (uint32_t *)buf;
     167             : 
     168           0 :         pp[0] = cpu_to_le32(hdr->header);
     169           0 :         pp[1] = cpu_to_le32(hdr->version);
     170           0 :         pp[2] = cpu_to_le32(hdr->first_rec_offset);
     171           0 :         pp[3] = cpu_to_le32(hdr->tbl_size);
     172           0 :         pp[4] = cpu_to_le32(hdr->checksum);
     173             : }
     174             : 
     175             : static void
     176             : __decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr,
     177             :                                unsigned char *buf)
     178             : {
     179           0 :         u32 *pp = (uint32_t *)buf;
     180             : 
     181           0 :         hdr->header        = le32_to_cpu(pp[0]);
     182           0 :         hdr->version       = le32_to_cpu(pp[1]);
     183           0 :         hdr->first_rec_offset = le32_to_cpu(pp[2]);
     184           0 :         hdr->tbl_size              = le32_to_cpu(pp[3]);
     185           0 :         hdr->checksum              = le32_to_cpu(pp[4]);
     186             : }
     187             : 
     188           0 : static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
     189             : {
     190             :         u8 buf[RAS_TABLE_HEADER_SIZE];
     191           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
     192             :         int res;
     193             : 
     194           0 :         memset(buf, 0, sizeof(buf));
     195           0 :         __encode_table_header_to_buf(&control->tbl_hdr, buf);
     196             : 
     197             :         /* i2c may be unstable in gpu reset */
     198           0 :         down_read(&adev->reset_domain->sem);
     199           0 :         res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
     200           0 :                                   control->i2c_address +
     201           0 :                                   control->ras_header_offset,
     202             :                                   buf, RAS_TABLE_HEADER_SIZE);
     203           0 :         up_read(&adev->reset_domain->sem);
     204             : 
     205           0 :         if (res < 0) {
     206           0 :                 DRM_ERROR("Failed to write EEPROM table header:%d", res);
     207           0 :         } else if (res < RAS_TABLE_HEADER_SIZE) {
     208           0 :                 DRM_ERROR("Short write:%d out of %d\n",
     209             :                           res, RAS_TABLE_HEADER_SIZE);
     210           0 :                 res = -EIO;
     211             :         } else {
     212             :                 res = 0;
     213             :         }
     214             : 
     215           0 :         return res;
     216             : }
     217             : 
     218             : static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control)
     219             : {
     220             :         int ii;
     221             :         u8  *pp, csum;
     222             :         size_t sz;
     223             : 
     224             :         /* Header checksum, skip checksum field in the calculation */
     225           0 :         sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum);
     226           0 :         pp = (u8 *) &control->tbl_hdr;
     227           0 :         csum = 0;
     228           0 :         for (ii = 0; ii < sz; ii++, pp++)
     229           0 :                 csum += *pp;
     230             : 
     231             :         return csum;
     232             : }
     233             : 
     234           0 : static int amdgpu_ras_eeprom_correct_header_tag(
     235             :         struct amdgpu_ras_eeprom_control *control,
     236             :         uint32_t header)
     237             : {
     238           0 :         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
     239             :         u8 *hh;
     240             :         int res;
     241             :         u8 csum;
     242             : 
     243           0 :         csum = -hdr->checksum;
     244             : 
     245           0 :         hh = (void *) &hdr->header;
     246           0 :         csum -= (hh[0] + hh[1] + hh[2] + hh[3]);
     247           0 :         hh = (void *) &header;
     248           0 :         csum += hh[0] + hh[1] + hh[2] + hh[3];
     249           0 :         csum = -csum;
     250           0 :         mutex_lock(&control->ras_tbl_mutex);
     251           0 :         hdr->header = header;
     252           0 :         hdr->checksum = csum;
     253           0 :         res = __write_table_header(control);
     254           0 :         mutex_unlock(&control->ras_tbl_mutex);
     255             : 
     256           0 :         return res;
     257             : }
     258             : 
     259             : /**
     260             :  * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
     261             :  * @control: pointer to control structure
     262             :  *
     263             :  * Reset the contents of the header of the RAS EEPROM table.
     264             :  * Return 0 on success, -errno on error.
     265             :  */
     266           0 : int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
     267             : {
     268           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
     269           0 :         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
     270           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
     271             :         u8 csum;
     272             :         int res;
     273             : 
     274           0 :         mutex_lock(&control->ras_tbl_mutex);
     275             : 
     276           0 :         hdr->header = RAS_TABLE_HDR_VAL;
     277           0 :         hdr->version = RAS_TABLE_VER;
     278           0 :         hdr->first_rec_offset = RAS_RECORD_START;
     279           0 :         hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
     280             : 
     281           0 :         csum = __calc_hdr_byte_sum(control);
     282           0 :         csum = -csum;
     283           0 :         hdr->checksum = csum;
     284           0 :         res = __write_table_header(control);
     285             : 
     286           0 :         control->ras_num_recs = 0;
     287           0 :         control->ras_fri = 0;
     288             : 
     289           0 :         amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
     290             : 
     291           0 :         control->bad_channel_bitmap = 0;
     292           0 :         amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
     293           0 :         con->update_channel_flag = false;
     294             : 
     295           0 :         amdgpu_ras_debugfs_set_ret_size(control);
     296             : 
     297           0 :         mutex_unlock(&control->ras_tbl_mutex);
     298             : 
     299           0 :         return res;
     300             : }
     301             : 
     302             : static void
     303           0 : __encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control,
     304             :                              struct eeprom_table_record *record,
     305             :                              unsigned char *buf)
     306             : {
     307             :         __le64 tmp = 0;
     308           0 :         int i = 0;
     309             : 
     310             :         /* Next are all record fields according to EEPROM page spec in LE format */
     311           0 :         buf[i++] = record->err_type;
     312             : 
     313           0 :         buf[i++] = record->bank;
     314             : 
     315           0 :         tmp = cpu_to_le64(record->ts);
     316           0 :         memcpy(buf + i, &tmp, 8);
     317           0 :         i += 8;
     318             : 
     319           0 :         tmp = cpu_to_le64((record->offset & 0xffffffffffff));
     320           0 :         memcpy(buf + i, &tmp, 6);
     321           0 :         i += 6;
     322             : 
     323           0 :         buf[i++] = record->mem_channel;
     324           0 :         buf[i++] = record->mcumc_id;
     325             : 
     326           0 :         tmp = cpu_to_le64((record->retired_page & 0xffffffffffff));
     327           0 :         memcpy(buf + i, &tmp, 6);
     328           0 : }
     329             : 
     330             : static void
     331           0 : __decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control,
     332             :                                struct eeprom_table_record *record,
     333             :                                unsigned char *buf)
     334             : {
     335           0 :         __le64 tmp = 0;
     336           0 :         int i =  0;
     337             : 
     338             :         /* Next are all record fields according to EEPROM page spec in LE format */
     339           0 :         record->err_type = buf[i++];
     340             : 
     341           0 :         record->bank = buf[i++];
     342             : 
     343           0 :         memcpy(&tmp, buf + i, 8);
     344           0 :         record->ts = le64_to_cpu(tmp);
     345           0 :         i += 8;
     346             : 
     347           0 :         memcpy(&tmp, buf + i, 6);
     348           0 :         record->offset = (le64_to_cpu(tmp) & 0xffffffffffff);
     349           0 :         i += 6;
     350             : 
     351           0 :         record->mem_channel = buf[i++];
     352           0 :         record->mcumc_id = buf[i++];
     353             : 
     354           0 :         memcpy(&tmp, buf + i,  6);
     355           0 :         record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff);
     356           0 : }
     357             : 
     358           0 : bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
     359             : {
     360           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
     361             : 
     362           0 :         if (!__is_ras_eeprom_supported(adev))
     363             :                 return false;
     364             : 
     365             :         /* Skip the EEPROM table check for VEGA20 Gaming */
     366           0 :         if (!con)
     367             :                 return false;
     368             :         else
     369           0 :                 if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC)))
     370             :                         return false;
     371             : 
     372           0 :         if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
     373           0 :                 dev_warn(adev->dev, "This GPU is in BAD status.");
     374           0 :                 dev_warn(adev->dev, "Please retire it or set a larger "
     375             :                          "threshold value when reloading driver.\n");
     376           0 :                 return true;
     377             :         }
     378             : 
     379             :         return false;
     380             : }
     381             : 
     382             : /**
     383             :  * __amdgpu_ras_eeprom_write -- write indexed from buffer to EEPROM
     384             :  * @control: pointer to control structure
     385             :  * @buf: pointer to buffer containing data to write
     386             :  * @fri: start writing at this index
     387             :  * @num: number of records to write
     388             :  *
     389             :  * The caller must hold the table mutex in @control.
     390             :  * Return 0 on success, -errno otherwise.
     391             :  */
     392           0 : static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
     393             :                                      u8 *buf, const u32 fri, const u32 num)
     394             : {
     395           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
     396             :         u32 buf_size;
     397             :         int res;
     398             : 
     399             :         /* i2c may be unstable in gpu reset */
     400           0 :         down_read(&adev->reset_domain->sem);
     401           0 :         buf_size = num * RAS_TABLE_RECORD_SIZE;
     402           0 :         res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
     403           0 :                                   control->i2c_address +
     404           0 :                                   RAS_INDEX_TO_OFFSET(control, fri),
     405             :                                   buf, buf_size);
     406           0 :         up_read(&adev->reset_domain->sem);
     407           0 :         if (res < 0) {
     408           0 :                 DRM_ERROR("Writing %d EEPROM table records error:%d",
     409             :                           num, res);
     410           0 :         } else if (res < buf_size) {
     411             :                 /* Short write, return error.
     412             :                  */
     413           0 :                 DRM_ERROR("Wrote %d records out of %d",
     414             :                           res / RAS_TABLE_RECORD_SIZE, num);
     415           0 :                 res = -EIO;
     416             :         } else {
     417             :                 res = 0;
     418             :         }
     419             : 
     420           0 :         return res;
     421             : }
     422             : 
     423             : static int
     424           0 : amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
     425             :                                struct eeprom_table_record *record,
     426             :                                const u32 num)
     427             : {
     428           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
     429             :         u32 a, b, i;
     430             :         u8 *buf, *pp;
     431             :         int res;
     432             : 
     433           0 :         buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
     434           0 :         if (!buf)
     435             :                 return -ENOMEM;
     436             : 
     437             :         /* Encode all of them in one go.
     438             :          */
     439             :         pp = buf;
     440           0 :         for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
     441           0 :                 __encode_table_record_to_buf(control, &record[i], pp);
     442             : 
     443             :                 /* update bad channel bitmap */
     444           0 :                 if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
     445           0 :                         control->bad_channel_bitmap |= 1 << record[i].mem_channel;
     446           0 :                         con->update_channel_flag = true;
     447             :                 }
     448             :         }
     449             : 
     450             :         /* a, first record index to write into.
     451             :          * b, last record index to write into.
     452             :          * a = first index to read (fri) + number of records in the table,
     453             :          * b = a + @num - 1.
     454             :          * Let N = control->ras_max_record_count, then we have,
     455             :          * case 0: 0 <= a <= b < N,
     456             :          *   just append @num records starting at a;
     457             :          * case 1: 0 <= a < N <= b,
     458             :          *   append (N - a) records starting at a, and
     459             :          *   append the remainder,  b % N + 1, starting at 0.
     460             :          * case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases,
     461             :          * case 2a: 0 <= a <= b < N
     462             :          *   append num records starting at a; and fix fri if b overwrote it,
     463             :          *   and since a <= b, if b overwrote it then a must've also,
     464             :          *   and if b didn't overwrite it, then a didn't also.
     465             :          * case 2b: 0 <= b < a < N
     466             :          *   write num records starting at a, which wraps around 0=N
     467             :          *   and overwrite fri unconditionally. Now from case 2a,
     468             :          *   this means that b eclipsed fri to overwrite it and wrap
     469             :          *   around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally
     470             :          *   set fri = b + 1 (mod N).
     471             :          * Now, since fri is updated in every case, except the trivial case 0,
     472             :          * the number of records present in the table after writing, is,
     473             :          * num_recs - 1 = b - fri (mod N), and we take the positive value,
     474             :          * by adding an arbitrary multiple of N before taking the modulo N
     475             :          * as shown below.
     476             :          */
     477           0 :         a = control->ras_fri + control->ras_num_recs;
     478           0 :         b = a + num  - 1;
     479           0 :         if (b < control->ras_max_record_count) {
     480           0 :                 res = __amdgpu_ras_eeprom_write(control, buf, a, num);
     481           0 :         } else if (a < control->ras_max_record_count) {
     482             :                 u32 g0, g1;
     483             : 
     484           0 :                 g0 = control->ras_max_record_count - a;
     485           0 :                 g1 = b % control->ras_max_record_count + 1;
     486           0 :                 res = __amdgpu_ras_eeprom_write(control, buf, a, g0);
     487           0 :                 if (res)
     488             :                         goto Out;
     489           0 :                 res = __amdgpu_ras_eeprom_write(control,
     490           0 :                                                 buf + g0 * RAS_TABLE_RECORD_SIZE,
     491             :                                                 0, g1);
     492           0 :                 if (res)
     493             :                         goto Out;
     494           0 :                 if (g1 > control->ras_fri)
     495           0 :                         control->ras_fri = g1 % control->ras_max_record_count;
     496             :         } else {
     497           0 :                 a %= control->ras_max_record_count;
     498           0 :                 b %= control->ras_max_record_count;
     499             : 
     500           0 :                 if (a <= b) {
     501             :                         /* Note that, b - a + 1 = num. */
     502           0 :                         res = __amdgpu_ras_eeprom_write(control, buf, a, num);
     503           0 :                         if (res)
     504             :                                 goto Out;
     505           0 :                         if (b >= control->ras_fri)
     506           0 :                                 control->ras_fri = (b + 1) % control->ras_max_record_count;
     507             :                 } else {
     508             :                         u32 g0, g1;
     509             : 
     510             :                         /* b < a, which means, we write from
     511             :                          * a to the end of the table, and from
     512             :                          * the start of the table to b.
     513             :                          */
     514           0 :                         g0 = control->ras_max_record_count - a;
     515           0 :                         g1 = b + 1;
     516           0 :                         res = __amdgpu_ras_eeprom_write(control, buf, a, g0);
     517           0 :                         if (res)
     518             :                                 goto Out;
     519           0 :                         res = __amdgpu_ras_eeprom_write(control,
     520           0 :                                                         buf + g0 * RAS_TABLE_RECORD_SIZE,
     521             :                                                         0, g1);
     522           0 :                         if (res)
     523             :                                 goto Out;
     524           0 :                         control->ras_fri = g1 % control->ras_max_record_count;
     525             :                 }
     526             :         }
     527           0 :         control->ras_num_recs = 1 + (control->ras_max_record_count + b
     528           0 :                                      - control->ras_fri)
     529           0 :                 % control->ras_max_record_count;
     530             : Out:
     531           0 :         kfree(buf);
     532           0 :         return res;
     533             : }
     534             : 
     535             : static int
     536           0 : amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
     537             : {
     538           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
     539           0 :         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
     540             :         u8 *buf, *pp, csum;
     541             :         u32 buf_size;
     542             :         int res;
     543             : 
     544             :         /* Modify the header if it exceeds.
     545             :          */
     546           0 :         if (amdgpu_bad_page_threshold != 0 &&
     547           0 :             control->ras_num_recs >= ras->bad_page_cnt_threshold) {
     548           0 :                 dev_warn(adev->dev,
     549             :                         "Saved bad pages %d reaches threshold value %d\n",
     550             :                         control->ras_num_recs, ras->bad_page_cnt_threshold);
     551           0 :                 control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
     552             :         }
     553             : 
     554           0 :         control->tbl_hdr.version = RAS_TABLE_VER;
     555           0 :         control->tbl_hdr.first_rec_offset = RAS_INDEX_TO_OFFSET(control, control->ras_fri);
     556           0 :         control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
     557           0 :         control->tbl_hdr.checksum = 0;
     558             : 
     559           0 :         buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
     560           0 :         buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
     561           0 :         if (!buf) {
     562           0 :                 DRM_ERROR("allocating memory for table of size %d bytes failed\n",
     563             :                           control->tbl_hdr.tbl_size);
     564           0 :                 res = -ENOMEM;
     565           0 :                 goto Out;
     566             :         }
     567             : 
     568           0 :         down_read(&adev->reset_domain->sem);
     569           0 :         res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
     570           0 :                                  control->i2c_address +
     571           0 :                                  control->ras_record_offset,
     572             :                                  buf, buf_size);
     573           0 :         up_read(&adev->reset_domain->sem);
     574           0 :         if (res < 0) {
     575           0 :                 DRM_ERROR("EEPROM failed reading records:%d\n",
     576             :                           res);
     577           0 :                 goto Out;
     578           0 :         } else if (res < buf_size) {
     579           0 :                 DRM_ERROR("EEPROM read %d out of %d bytes\n",
     580             :                           res, buf_size);
     581           0 :                 res = -EIO;
     582           0 :                 goto Out;
     583             :         }
     584             : 
     585             :         /* Recalc the checksum.
     586             :          */
     587             :         csum = 0;
     588           0 :         for (pp = buf; pp < buf + buf_size; pp++)
     589           0 :                 csum += *pp;
     590             : 
     591           0 :         csum += __calc_hdr_byte_sum(control);
     592             :         /* avoid sign extension when assigning to "checksum" */
     593           0 :         csum = -csum;
     594           0 :         control->tbl_hdr.checksum = csum;
     595           0 :         res = __write_table_header(control);
     596             : Out:
     597           0 :         kfree(buf);
     598           0 :         return res;
     599             : }
     600             : 
     601             : /**
     602             :  * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
     603             :  * @control: pointer to control structure
     604             :  * @record: array of records to append
     605             :  * @num: number of records in @record array
     606             :  *
     607             :  * Append @num records to the table, calculate the checksum and write
     608             :  * the table back to EEPROM. The maximum number of records that
     609             :  * can be appended is between 1 and control->ras_max_record_count,
     610             :  * regardless of how many records are already stored in the table.
     611             :  *
     612             :  * Return 0 on success or if EEPROM is not supported, -errno on error.
     613             :  */
     614           0 : int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
     615             :                              struct eeprom_table_record *record,
     616             :                              const u32 num)
     617             : {
     618           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
     619             :         int res;
     620             : 
     621           0 :         if (!__is_ras_eeprom_supported(adev))
     622             :                 return 0;
     623             : 
     624           0 :         if (num == 0) {
     625           0 :                 DRM_ERROR("will not append 0 records\n");
     626           0 :                 return -EINVAL;
     627           0 :         } else if (num > control->ras_max_record_count) {
     628           0 :                 DRM_ERROR("cannot append %d records than the size of table %d\n",
     629             :                           num, control->ras_max_record_count);
     630           0 :                 return -EINVAL;
     631             :         }
     632             : 
     633           0 :         mutex_lock(&control->ras_tbl_mutex);
     634             : 
     635           0 :         res = amdgpu_ras_eeprom_append_table(control, record, num);
     636           0 :         if (!res)
     637           0 :                 res = amdgpu_ras_eeprom_update_header(control);
     638           0 :         if (!res)
     639             :                 amdgpu_ras_debugfs_set_ret_size(control);
     640             : 
     641           0 :         mutex_unlock(&control->ras_tbl_mutex);
     642           0 :         return res;
     643             : }
     644             : 
     645             : /**
     646             :  * __amdgpu_ras_eeprom_read -- read indexed from EEPROM into buffer
     647             :  * @control: pointer to control structure
     648             :  * @buf: pointer to buffer to read into
     649             :  * @fri: first record index, start reading at this index, absolute index
     650             :  * @num: number of records to read
     651             :  *
     652             :  * The caller must hold the table mutex in @control.
     653             :  * Return 0 on success, -errno otherwise.
     654             :  */
     655           0 : static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
     656             :                                     u8 *buf, const u32 fri, const u32 num)
     657             : {
     658           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
     659             :         u32 buf_size;
     660             :         int res;
     661             : 
     662             :         /* i2c may be unstable in gpu reset */
     663           0 :         down_read(&adev->reset_domain->sem);
     664           0 :         buf_size = num * RAS_TABLE_RECORD_SIZE;
     665           0 :         res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
     666           0 :                                  control->i2c_address +
     667           0 :                                  RAS_INDEX_TO_OFFSET(control, fri),
     668             :                                  buf, buf_size);
     669           0 :         up_read(&adev->reset_domain->sem);
     670           0 :         if (res < 0) {
     671           0 :                 DRM_ERROR("Reading %d EEPROM table records error:%d",
     672             :                           num, res);
     673           0 :         } else if (res < buf_size) {
     674             :                 /* Short read, return error.
     675             :                  */
     676           0 :                 DRM_ERROR("Read %d records out of %d",
     677             :                           res / RAS_TABLE_RECORD_SIZE, num);
     678           0 :                 res = -EIO;
     679             :         } else {
     680             :                 res = 0;
     681             :         }
     682             : 
     683           0 :         return res;
     684             : }
     685             : 
     686             : /**
     687             :  * amdgpu_ras_eeprom_read -- read EEPROM
     688             :  * @control: pointer to control structure
     689             :  * @record: array of records to read into
     690             :  * @num: number of records in @record
     691             :  *
     692             :  * Reads num records from the RAS table in EEPROM and
     693             :  * writes the data into @record array.
     694             :  *
     695             :  * Returns 0 on success, -errno on error.
     696             :  */
     697           0 : int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
     698             :                            struct eeprom_table_record *record,
     699             :                            const u32 num)
     700             : {
     701           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
     702           0 :         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
     703             :         int i, res;
     704             :         u8 *buf, *pp;
     705             :         u32 g0, g1;
     706             : 
     707           0 :         if (!__is_ras_eeprom_supported(adev))
     708             :                 return 0;
     709             : 
     710           0 :         if (num == 0) {
     711           0 :                 DRM_ERROR("will not read 0 records\n");
     712           0 :                 return -EINVAL;
     713           0 :         } else if (num > control->ras_num_recs) {
     714           0 :                 DRM_ERROR("too many records to read:%d available:%d\n",
     715             :                           num, control->ras_num_recs);
     716           0 :                 return -EINVAL;
     717             :         }
     718             : 
     719           0 :         buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
     720           0 :         if (!buf)
     721             :                 return -ENOMEM;
     722             : 
     723             :         /* Determine how many records to read, from the first record
     724             :          * index, fri, to the end of the table, and from the beginning
     725             :          * of the table, such that the total number of records is
     726             :          * @num, and we handle wrap around when fri > 0 and
     727             :          * fri + num > RAS_MAX_RECORD_COUNT.
     728             :          *
     729             :          * First we compute the index of the last element
     730             :          * which would be fetched from each region,
     731             :          * g0 is in [fri, fri + num - 1], and
     732             :          * g1 is in [0, RAS_MAX_RECORD_COUNT - 1].
     733             :          * Then, if g0 < RAS_MAX_RECORD_COUNT, the index of
     734             :          * the last element to fetch, we set g0 to _the number_
     735             :          * of elements to fetch, @num, since we know that the last
     736             :          * indexed to be fetched does not exceed the table.
     737             :          *
     738             :          * If, however, g0 >= RAS_MAX_RECORD_COUNT, then
     739             :          * we set g0 to the number of elements to read
     740             :          * until the end of the table, and g1 to the number of
     741             :          * elements to read from the beginning of the table.
     742             :          */
     743           0 :         g0 = control->ras_fri + num - 1;
     744           0 :         g1 = g0 % control->ras_max_record_count;
     745           0 :         if (g0 < control->ras_max_record_count) {
     746             :                 g0 = num;
     747             :                 g1 = 0;
     748             :         } else {
     749           0 :                 g0 = control->ras_max_record_count - control->ras_fri;
     750           0 :                 g1 += 1;
     751             :         }
     752             : 
     753           0 :         mutex_lock(&control->ras_tbl_mutex);
     754           0 :         res = __amdgpu_ras_eeprom_read(control, buf, control->ras_fri, g0);
     755           0 :         if (res)
     756             :                 goto Out;
     757           0 :         if (g1) {
     758           0 :                 res = __amdgpu_ras_eeprom_read(control,
     759           0 :                                                buf + g0 * RAS_TABLE_RECORD_SIZE,
     760             :                                                0, g1);
     761           0 :                 if (res)
     762             :                         goto Out;
     763             :         }
     764             : 
     765             :         res = 0;
     766             : 
     767             :         /* Read up everything? Then transform.
     768             :          */
     769             :         pp = buf;
     770           0 :         for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
     771           0 :                 __decode_table_record_from_buf(control, &record[i], pp);
     772             : 
     773             :                 /* update bad channel bitmap */
     774           0 :                 if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
     775           0 :                         control->bad_channel_bitmap |= 1 << record[i].mem_channel;
     776           0 :                         con->update_channel_flag = true;
     777             :                 }
     778             :         }
     779             : Out:
     780           0 :         kfree(buf);
     781           0 :         mutex_unlock(&control->ras_tbl_mutex);
     782             : 
     783           0 :         return res;
     784             : }
     785             : 
     786           0 : uint32_t amdgpu_ras_eeprom_max_record_count(void)
     787             : {
     788           0 :         return RAS_MAX_RECORD_COUNT;
     789             : }
     790             : 
     791             : static ssize_t
     792           0 : amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf,
     793             :                                     size_t size, loff_t *pos)
     794             : {
     795           0 :         struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
     796           0 :         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
     797           0 :         struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
     798             :         u8 data[50];
     799             :         int res;
     800             : 
     801           0 :         if (!size)
     802           0 :                 return size;
     803             : 
     804           0 :         if (!ras || !control) {
     805           0 :                 res = snprintf(data, sizeof(data), "Not supported\n");
     806             :         } else {
     807           0 :                 res = snprintf(data, sizeof(data), "%d bytes or %d records\n",
     808             :                                RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
     809             :         }
     810             : 
     811           0 :         if (*pos >= res)
     812             :                 return 0;
     813             : 
     814           0 :         res -= *pos;
     815           0 :         res = min_t(size_t, res, size);
     816             : 
     817           0 :         if (copy_to_user(buf, &data[*pos], res))
     818             :                 return -EFAULT;
     819             : 
     820           0 :         *pos += res;
     821             : 
     822           0 :         return res;
     823             : }
     824             : 
     825             : const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
     826             :         .owner = THIS_MODULE,
     827             :         .read = amdgpu_ras_debugfs_eeprom_size_read,
     828             :         .write = NULL,
     829             :         .llseek = default_llseek,
     830             : };
     831             : 
/* Text layout of the debugfs EEPROM table dump.
 *
 * tbl_hdr_str / rec_hdr_str are the fixed column title lines, and
 * tbl_hdr_fmt / rec_hdr_fmt are the printf formats for the header row
 * and for each record row.
 */
static const char *tbl_hdr_str = " Signature    Version  FirstOffs       Size   Checksum\n";
static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
/* Length in bytes of one line produced by tbl_hdr_fmt: five "0x" + 8 hex
 * digit fields, four separating spaces and the newline. Must be kept in
 * sync with tbl_hdr_fmt.
 */
#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
static const char *rec_hdr_str = "Index  Offset ErrType Bank/CU          TimeStamp      Offs/Addr MemChl MCUMCID    RetiredPage\n";
static const char *rec_hdr_fmt = "%5d 0x%05X %7s    0x%02X 0x%016llX 0x%012llX   0x%02X    0x%02X 0x%012llX\n";
/* Length in bytes of one line produced by rec_hdr_fmt, summed column by
 * column (field width, then separator), ending with the newline. Must be
 * kept in sync with rec_hdr_fmt; the debugfs reader uses it to map a file
 * offset to a record index.
 */
#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)

/* Printable names for a record's error type, indexed by the err_type value. */
static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
	"ignore",
	"re",
	"ue",
};
     844             : 
     845           0 : static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
     846             : {
     847           0 :         return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
     848           0 :                 strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
     849             : }
     850             : 
     851           0 : void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
     852             : {
     853           0 :         struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras,
     854             :                                               eeprom_control);
     855           0 :         struct dentry *de = ras->de_ras_eeprom_table;
     856             : 
     857           0 :         if (de)
     858           0 :                 d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control);
     859           0 : }
     860             : 
     861           0 : static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
     862             :                                              size_t size, loff_t *pos)
     863             : {
     864           0 :         struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
     865           0 :         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
     866           0 :         struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
     867           0 :         const size_t orig_size = size;
     868           0 :         int res = -EFAULT;
     869             :         size_t data_len;
     870             : 
     871           0 :         mutex_lock(&control->ras_tbl_mutex);
     872             : 
     873             :         /* We want *pos - data_len > 0, which means there's
     874             :          * bytes to be printed from data.
     875             :          */
     876           0 :         data_len = strlen(tbl_hdr_str);
     877           0 :         if (*pos < data_len) {
     878           0 :                 data_len -= *pos;
     879           0 :                 data_len = min_t(size_t, data_len, size);
     880           0 :                 if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len))
     881             :                         goto Out;
     882           0 :                 buf += data_len;
     883           0 :                 size -= data_len;
     884           0 :                 *pos += data_len;
     885             :         }
     886             : 
     887           0 :         data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
     888           0 :         if (*pos < data_len && size > 0) {
     889             :                 u8 data[tbl_hdr_fmt_size + 1];
     890             :                 loff_t lpos;
     891             : 
     892           0 :                 snprintf(data, sizeof(data), tbl_hdr_fmt,
     893             :                          control->tbl_hdr.header,
     894             :                          control->tbl_hdr.version,
     895             :                          control->tbl_hdr.first_rec_offset,
     896             :                          control->tbl_hdr.tbl_size,
     897             :                          control->tbl_hdr.checksum);
     898             : 
     899           0 :                 data_len -= *pos;
     900           0 :                 data_len = min_t(size_t, data_len, size);
     901           0 :                 lpos = *pos - strlen(tbl_hdr_str);
     902           0 :                 if (copy_to_user(buf, &data[lpos], data_len))
     903             :                         goto Out;
     904           0 :                 buf += data_len;
     905           0 :                 size -= data_len;
     906           0 :                 *pos += data_len;
     907             :         }
     908             : 
     909           0 :         data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
     910           0 :         if (*pos < data_len && size > 0) {
     911             :                 loff_t lpos;
     912             : 
     913           0 :                 data_len -= *pos;
     914           0 :                 data_len = min_t(size_t, data_len, size);
     915           0 :                 lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
     916           0 :                 if (copy_to_user(buf, &rec_hdr_str[lpos], data_len))
     917             :                         goto Out;
     918           0 :                 buf += data_len;
     919           0 :                 size -= data_len;
     920           0 :                 *pos += data_len;
     921             :         }
     922             : 
     923           0 :         data_len = amdgpu_ras_debugfs_table_size(control);
     924           0 :         if (*pos < data_len && size > 0) {
     925             :                 u8 dare[RAS_TABLE_RECORD_SIZE];
     926             :                 u8 data[rec_hdr_fmt_size + 1];
     927             :                 struct eeprom_table_record record;
     928             :                 int s, r;
     929             : 
     930             :                 /* Find the starting record index
     931             :                  */
     932           0 :                 s = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
     933           0 :                         strlen(rec_hdr_str);
     934           0 :                 s = s / rec_hdr_fmt_size;
     935           0 :                 r = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
     936           0 :                         strlen(rec_hdr_str);
     937           0 :                 r = r % rec_hdr_fmt_size;
     938             : 
     939           0 :                 for ( ; size > 0 && s < control->ras_num_recs; s++) {
     940           0 :                         u32 ai = RAS_RI_TO_AI(control, s);
     941             :                         /* Read a single record
     942             :                          */
     943           0 :                         res = __amdgpu_ras_eeprom_read(control, dare, ai, 1);
     944           0 :                         if (res)
     945             :                                 goto Out;
     946           0 :                         __decode_table_record_from_buf(control, &record, dare);
     947           0 :                         snprintf(data, sizeof(data), rec_hdr_fmt,
     948             :                                  s,
     949           0 :                                  RAS_INDEX_TO_OFFSET(control, ai),
     950           0 :                                  record_err_type_str[record.err_type],
     951           0 :                                  record.bank,
     952             :                                  record.ts,
     953             :                                  record.offset,
     954           0 :                                  record.mem_channel,
     955           0 :                                  record.mcumc_id,
     956             :                                  record.retired_page);
     957             : 
     958           0 :                         data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
     959           0 :                         if (copy_to_user(buf, &data[r], data_len)) {
     960             :                                 res = -EFAULT;
     961             :                                 goto Out;
     962             :                         }
     963           0 :                         buf += data_len;
     964           0 :                         size -= data_len;
     965           0 :                         *pos += data_len;
     966           0 :                         r = 0;
     967             :                 }
     968             :         }
     969             :         res = 0;
     970             : Out:
     971           0 :         mutex_unlock(&control->ras_tbl_mutex);
     972           0 :         return res < 0 ? res : orig_size - size;
     973             : }
     974             : 
     975             : static ssize_t
     976           0 : amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
     977             :                                      size_t size, loff_t *pos)
     978             : {
     979           0 :         struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
     980           0 :         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
     981           0 :         struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
     982             :         u8 data[81];
     983             :         int res;
     984             : 
     985           0 :         if (!size)
     986           0 :                 return size;
     987             : 
     988           0 :         if (!ras || !control) {
     989           0 :                 res = snprintf(data, sizeof(data), "Not supported\n");
     990           0 :                 if (*pos >= res)
     991             :                         return 0;
     992             : 
     993           0 :                 res -= *pos;
     994           0 :                 res = min_t(size_t, res, size);
     995             : 
     996           0 :                 if (copy_to_user(buf, &data[*pos], res))
     997             :                         return -EFAULT;
     998             : 
     999           0 :                 *pos += res;
    1000             : 
    1001           0 :                 return res;
    1002             :         } else {
    1003           0 :                 return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
    1004             :         }
    1005             : }
    1006             : 
    1007             : const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
    1008             :         .owner = THIS_MODULE,
    1009             :         .read = amdgpu_ras_debugfs_eeprom_table_read,
    1010             :         .write = NULL,
    1011             :         .llseek = default_llseek,
    1012             : };
    1013             : 
    1014             : /**
    1015             :  * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
    1016             :  * @control: pointer to control structure
    1017             :  *
    1018             :  * Check the checksum of the stored in EEPROM RAS table.
    1019             :  *
    1020             :  * Return 0 if the checksum is correct,
    1021             :  * positive if it is not correct, and
    1022             :  * -errno on I/O error.
    1023             :  */
    1024           0 : static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control)
    1025             : {
    1026           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
    1027             :         int buf_size, res;
    1028             :         u8  csum, *buf, *pp;
    1029             : 
    1030           0 :         buf_size = RAS_TABLE_HEADER_SIZE +
    1031           0 :                 control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
    1032           0 :         buf = kzalloc(buf_size, GFP_KERNEL);
    1033           0 :         if (!buf) {
    1034           0 :                 DRM_ERROR("Out of memory checking RAS table checksum.\n");
    1035           0 :                 return -ENOMEM;
    1036             :         }
    1037             : 
    1038           0 :         res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
    1039           0 :                                  control->i2c_address +
    1040           0 :                                  control->ras_header_offset,
    1041             :                                  buf, buf_size);
    1042           0 :         if (res < buf_size) {
    1043           0 :                 DRM_ERROR("Partial read for checksum, res:%d\n", res);
    1044             :                 /* On partial reads, return -EIO.
    1045             :                  */
    1046           0 :                 if (res >= 0)
    1047           0 :                         res = -EIO;
    1048             :                 goto Out;
    1049             :         }
    1050             : 
    1051             :         csum = 0;
    1052           0 :         for (pp = buf; pp < buf + buf_size; pp++)
    1053           0 :                 csum += *pp;
    1054             : Out:
    1055           0 :         kfree(buf);
    1056           0 :         return res < 0 ? res : csum;
    1057             : }
    1058             : /**
                     :  * amdgpu_ras_eeprom_init - read the RAS EEPROM table header and act on it
                     :  * @control: RAS EEPROM control structure; its offsets, maximum record
                     :  *           count, mutex, record count and first record index are set here
                     :  * @exceed_err_limit: output flag; cleared on entry and set to true only
                     :  *           when the header is RAS_TABLE_HDR_BAD, the record count has
                     :  *           reached the threshold and amdgpu_bad_page_threshold is
                     :  *           neither 0 nor -2
                     :  *
                     :  * Reads RAS_TABLE_HEADER_SIZE bytes from the EEPROM, decodes them into
                     :  * control->tbl_hdr and then branches on the header signature: a valid
                     :  * table is checksummed and checked against 90% of the threshold, a table
                     :  * marked bad is either re-tagged as valid or reported as over the limit,
                     :  * and in every other case the table is reset.
                     :  *
                     :  * Return: 0 when RAS EEPROM is not supported or when the last step left a
                     :  * non-negative result; -ENOENT when the i2c bus or its algo is missing;
                     :  * -EINVAL when __get_eeprom_i2c_addr() fails; -EIO on a short header read;
                     :  * otherwise the negative value from the header read or left in res by the
                     :  * last step.
                     :  */
    1059           0 : int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
    1060             :                            bool *exceed_err_limit)
    1061             : {
    1062           0 :         struct amdgpu_device *adev = to_amdgpu_device(control);
    1063           0 :         unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
    1064           0 :         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
    1065           0 :         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
    1066             :         int res;
    1067             :         /* Default to limit not exceeded; only one path below sets the flag. */
    1068           0 :         *exceed_err_limit = false;
    1069             :         /* Without RAS EEPROM support there is nothing to do; report success. */
    1070           0 :         if (!__is_ras_eeprom_supported(adev))
    1071             :                 return 0;
    1072             : 
    1073             :         /* Verify i2c adapter is initialized */
    1074           0 :         if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
    1075             :                 return -ENOENT;
    1076             :         /* A false result from __get_eeprom_i2c_addr() is reported as -EINVAL. */
    1077           0 :         if (!__get_eeprom_i2c_addr(adev, control))
    1078             :                 return -EINVAL;
    1079             : 
    1080           0 :         control->ras_header_offset = RAS_HDR_START;
    1081           0 :         control->ras_record_offset = RAS_RECORD_START;
    1082           0 :         control->ras_max_record_count  = RAS_MAX_RECORD_COUNT;
    1083           0 :         mutex_init(&control->ras_tbl_mutex);
    1084             : 
    1085             :         /* Read RAS_TABLE_HEADER_SIZE bytes of table header from the EEPROM
                     :          * at i2c_address + ras_header_offset.
                     :          */
    1086           0 :         res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
    1087           0 :                                  control->i2c_address + control->ras_header_offset,
    1088             :                                  buf, RAS_TABLE_HEADER_SIZE);
    1089           0 :         if (res < RAS_TABLE_HEADER_SIZE) { /* error or short read */
    1090           0 :                 DRM_ERROR("Failed to read EEPROM table header, res:%d", res);
    1091           0 :                 return res >= 0 ? -EIO : res; /* short read becomes -EIO */
    1092             :         }
    1093             : 
    1094           0 :         __decode_table_header_from_buf(hdr, buf);
    1095             :         /* Cache the record count and first record index derived from the header. */
    1096           0 :         control->ras_num_recs = RAS_NUM_RECS(hdr);
    1097           0 :         control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
    1098             :         /* Branch on the header signature: valid, marked bad, or anything else. */
    1099           0 :         if (hdr->header == RAS_TABLE_HDR_VAL) {
    1100           0 :                 DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
    1101             :                                  control->ras_num_recs);
    1102           0 :                 res = __verify_ras_table_checksum(control);
    1103           0 :                 if (res) /* non-zero is only logged here, execution continues */
    1104           0 :                         DRM_ERROR("RAS table incorrect checksum or error:%d\n",
    1105             :                                   res);
    1106             : 
    1107             :                 /* Warn if we are at 90% of the threshold or above, written in
    1108             :                  * integer form as 10 * ras_num_recs >= 9 * threshold.
                     :                  */
    1109           0 :                 if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
    1110           0 :                         dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
    1111             :                                         control->ras_num_recs,
    1112             :                                         ras->bad_page_cnt_threshold);
    1113           0 :         } else if (hdr->header == RAS_TABLE_HDR_BAD &&
    1114           0 :                    amdgpu_bad_page_threshold != 0) { /* NOTE(review): 0 presumably disables threshold handling - confirm */
    1115           0 :                 res = __verify_ras_table_checksum(control);
    1116           0 :                 if (res) /* non-zero is only logged here, execution continues */
    1117           0 :                         DRM_ERROR("RAS Table incorrect checksum or error:%d\n",
    1118             :                                   res);
    1119           0 :                 if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
    1120             :                         /* This means that the threshold was increased since
    1121             :                          * the last time the system was booted, and now
    1122             :                          * ras->bad_page_cnt_threshold - control->ras_num_recs > 0,
    1123             :                          * so at least one more record can be saved before the
    1124             :                          * threshold is reached. The header tag is set back to
    1125             :                          * RAS_TABLE_HDR_VAL and res takes that call's result.
                     :                          */
    1126           0 :                         dev_info(adev->dev,
    1127             :                                  "records:%d threshold:%d, resetting "
    1128             :                                  "RAS table header signature",
    1129             :                                  control->ras_num_recs,
    1130             :                                  ras->bad_page_cnt_threshold);
    1131           0 :                         res = amdgpu_ras_eeprom_correct_header_tag(control,
    1132             :                                                                    RAS_TABLE_HDR_VAL);
    1133             :                 } else {
    1134           0 :                         dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
    1135             :                                 control->ras_num_recs, ras->bad_page_cnt_threshold);
    1136           0 :                         if (amdgpu_bad_page_threshold == -2) {
    1137           0 :                                 dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -2.");
    1138           0 :                                 res = 0; /* discard the checksum result and report success */
    1139             :                         } else {
    1140           0 :                                 *exceed_err_limit = true; /* the only place the flag is raised */
    1141           0 :                                 dev_err(adev->dev,
    1142             :                                         "RAS records:%d exceed threshold:%d, "
    1143             :                                         "GPU will not be initialized. Replace this GPU or increase the threshold",
    1144             :                                         control->ras_num_recs, ras->bad_page_cnt_threshold);
    1145             :                         }
    1146             :                 }
    1147             :         } else {
    1148           0 :                 DRM_INFO("Creating a new EEPROM table");
    1149             :                 /* Reached for any other signature, and also for a table marked
                     :                  * bad while amdgpu_bad_page_threshold is 0.
                     :                  */
    1150           0 :                 res = amdgpu_ras_eeprom_reset_table(control);
    1151             :         }
    1152             :         /* Only negative values are errors; positive results collapse to 0. */
    1153           0 :         return res < 0 ? res : 0;
    1154             : }

Generated by: LCOV version 1.14