From fe7f819bc0fc62e6a5a70d690278d9a0e58a78fc Mon Sep 17 00:00:00 2001
From: chrfranke <chrfranke@4ea69e1a-61f1-4043-bf83-b5c94c648137>
Date: Tue, 30 May 2023 11:55:00 +0000
Subject: [PATCH] smartd.cpp: Don't report new non-device related errors as
 critical (#1222). smartd.conf.5.in: Document new functionality.

git-svn-id: https://svn.code.sf.net/p/smartmontools/code/trunk@5472 4ea69e1a-61f1-4043-bf83-b5c94c648137
---
 smartmontools/ChangeLog        |  6 +++
 smartmontools/smartd.conf.5.in | 14 +++++
 smartmontools/smartd.cpp       | 93 ++++++++++++++++++++++++++++++----
 3 files changed, 103 insertions(+), 10 deletions(-)

diff --git a/smartmontools/ChangeLog b/smartmontools/ChangeLog
index 68e79ff73..0232014ed 100644
--- a/smartmontools/ChangeLog
+++ b/smartmontools/ChangeLog
@@ -1,5 +1,11 @@
 $Id$
 
+2023-05-30  Christian Franke  <franke@computer.org>
+
+	smartd.cpp: Don't report new non-device related errors as critical
+	(#1222).
+	smartd.conf.5.in: Document new functionality.
+
 2023-05-29  Christian Franke  <franke@computer.org>
 
 	Add error messages for NVMe status values.
diff --git a/smartmontools/smartd.conf.5.in b/smartmontools/smartd.conf.5.in
index 9e2bb33ba..fe8e1e32f 100644
--- a/smartmontools/smartd.conf.5.in
+++ b/smartmontools/smartd.conf.5.in
@@ -740,6 +740,20 @@ error log has increased since the last check.
 .I error
 \- [NVMe] report if the "Number of Error Information Log Entries" from the
 SMART/Health Information log has increased since the last check.
+.br
+[NEW EXPERIMENTAL SMARTD FEATURE]
+This will only be logged as LOG_CRIT if at least one of the new errors is
+still present in the Error Information log and its status indicates a
+device related error.
+Up to eight of the most recent of these errors are logged as LOG_INFO then.
+This is useful because the NVMe Error Information log is not persistent
+across power cycles or device resets.
+.br
+If all new errors are either no longer present in the log or are not device
+related (e.g. invalid command, invalid field in command, ...), a LOG_INFO
+message is generated instead.
+This avoids misleading warnings if the operating system issues unsupported
+commands and the device firmware also logs these kind of errors.
 .Sp
 .\" %ENDIF OS Darwin FreeBSD Linux NetBSD Windows Cygwin
 .I xerror
diff --git a/smartmontools/smartd.cpp b/smartmontools/smartd.cpp
index 7790249dc..35c9ea5bf 100644
--- a/smartmontools/smartd.cpp
+++ b/smartmontools/smartd.cpp
@@ -2,7 +2,7 @@
  * Home page of code is: https://www.smartmontools.org
  *
  * Copyright (C) 2002-11 Bruce Allen
- * Copyright (C) 2008-22 Christian Franke
+ * Copyright (C) 2008-23 Christian Franke
  * Copyright (C) 2000    Michael Cornwell <cornwell@acm.org>
  * Copyright (C) 2008    Oliver Bock <brevilo@users.sourceforge.net>
  *
@@ -448,6 +448,9 @@ struct dev_config
   attribute_flags monitor_attr_flags;     // MONITOR_* flags for each attribute
 
   ata_vendor_attr_defs attribute_defs;    // -v options
+
+  // NVMe only
+  unsigned nvme_err_log_max_entries{};    // size of error log
 };
 
 // Number of allowed mail message types
@@ -2678,6 +2681,74 @@ static int nvme_get_max_temp_kelvin(const nvme_smart_log & smart_log)
   return k;
 }
 
+// Check the NVMe Error Information log for device related errors.
+static bool check_nvme_error_log(const dev_config & cfg, dev_state & state, nvme_device * nvmedev,
+  uint64_t newcnt = 0)
+{
+  // Limit transfer size to one page (64 entries) to avoid problems with
+  // limits of NVMe pass-through layer or too low MDTS values.
+  unsigned want_entries = 64;
+  if (want_entries > cfg.nvme_err_log_max_entries)
+    want_entries = cfg.nvme_err_log_max_entries;
+  raw_buffer error_log_buf(want_entries * sizeof(nvme_error_log_page));
+  nvme_error_log_page * error_log =
+    reinterpret_cast<nvme_error_log_page *>(error_log_buf.data());
+  unsigned read_entries = nvme_read_error_log(nvmedev, error_log, want_entries, false /*!lpo_sup*/);
+  if (!read_entries) {
+    PrintOut(LOG_INFO, "Device: %s, Read %u entries from Error Information Log failed\n",
+      cfg.name.c_str(), want_entries);
+    return false;
+  }
+
+  if (!newcnt)
+    return true; // Support check only
+
+  // Scan log, find device related errors
+  uint64_t oldcnt = state.nvme_err_log_entries, mincnt = newcnt;
+  int err = 0, ign = 0;
+  for (unsigned i = 0; i < read_entries; i++) {
+    const nvme_error_log_page & e = error_log[i];
+    if (!e.error_count)
+      continue; // unused
+    if (e.error_count <= oldcnt)
+      break; // stop on first old entry
+    if (e.error_count < mincnt)
+      mincnt = e.error_count; // min known error
+    if (e.error_count > newcnt)
+      newcnt = e.error_count; // adjust maximum
+    uint16_t status = e.status_field >> 1;
+    if (!nvme_status_is_error(status) || nvme_status_to_errno(status) == EINVAL) {
+      ign++; // Not a device related error
+      continue;
+    }
+
+    // Log the most recent 8 errors
+    if (++err > 8)
+      continue;
+    char buf[64];
+    PrintOut(LOG_INFO, "Device: %s, NVMe error [%u], count %" PRIu64 ", status 0x%04x: %s\n",
+      cfg.name.c_str(), i, e.error_count, e.status_field,
+      nvme_status_to_info_str(buf, e.status_field >> 1));
+  }
+
+  std::string msg = strprintf("Device: %s, NVMe error count increased from %" PRIu64 " to %" PRIu64
+                              " (%d new, %d ignored, %" PRIu64 " unknown)",
+                              cfg.name.c_str(), oldcnt, newcnt, err, ign,
+                              (mincnt > oldcnt + 1 ? mincnt - oldcnt - 1 : 0));
+  // LOG_CRIT only if device related errors are found
+  if (!err) {
+    PrintOut(LOG_INFO, "%s\n", msg.c_str());
+  }
+  else {
+    PrintOut(LOG_CRIT, "%s\n", msg.c_str());
+    MailWarning(cfg, state, 4, "%s", msg.c_str());
+  }
+
+  state.nvme_err_log_entries = newcnt;
+  state.must_write = true;
+  return true;
+}
+
 static int NVMeDeviceScan(dev_config & cfg, dev_state & state, nvme_device * nvmedev,
                           const dev_config_vector * prev_cfgs)
 {
@@ -2739,8 +2810,14 @@ static int NVMeDeviceScan(dev_config & cfg, dev_state & state, nvme_device * nvm
   }
 
   // Init total error count
+  cfg.nvme_err_log_max_entries = id_ctrl.elpe + 1; // 0's based value
   if (cfg.errorlog || cfg.xerrorlog) {
-    state.nvme_err_log_entries = le128_to_uint64(smart_log.num_err_log_entries);
+    if (!check_nvme_error_log(cfg, state, nvmedev)) {
+      PrintOut(LOG_INFO, "Device: %s, Error Information unavailable, ignoring -l [x]error\n", name);
+      cfg.errorlog = cfg.xerrorlog = false;
+    }
+    else
+      state.nvme_err_log_entries = le128_to_uint64(smart_log.num_err_log_entries);
   }
 
   // If no supported tests selected, return
@@ -3854,16 +3931,12 @@ static int NVMeCheckDevice(const dev_config & cfg, dev_state & state, nvme_devic
 
   // Check if number of errors has increased
   if (cfg.errorlog || cfg.xerrorlog) {
-    uint64_t oldcnt = state.nvme_err_log_entries;
     uint64_t newcnt = le128_to_uint64(smart_log.num_err_log_entries);
-    if (newcnt > oldcnt) {
-      PrintOut(LOG_CRIT, "Device: %s, number of Error Log entries increased from %" PRIu64 " to %" PRIu64 "\n",
-               name, oldcnt, newcnt);
-      MailWarning(cfg, state, 4, "Device: %s, number of Error Log entries increased from %" PRIu64 " to %" PRIu64,
-                  name, oldcnt, newcnt);
-      state.must_write = true;
+    if (newcnt > state.nvme_err_log_entries) {
+      // Warn only if device related errors are found
+      check_nvme_error_log(cfg, state, nvmedev, newcnt);
     }
-    state.nvme_err_log_entries = newcnt;
+    // else // TODO: Handle decrease of count?
   }
 
   CloseDevice(nvmedev, name);
-- 
GitLab