diff --git a/sm5/CHANGELOG b/sm5/CHANGELOG index 879411a717048772d9e707231fa618dbd067df5d..0868e0fc98099f2e791c1459f383896ddd1f1d8f 100644 --- a/sm5/CHANGELOG +++ b/sm5/CHANGELOG @@ -1,6 +1,6 @@ CHANGELOG for smartmontools -$Id: CHANGELOG,v 1.254 2003/11/14 05:31:17 dpgilbert Exp $ +$Id: CHANGELOG,v 1.255 2003/11/14 07:41:39 ballen4705 Exp $ Maintainers / Developers Key: [BA] Bruce Allen @@ -23,6 +23,10 @@ CURRENT DEVELOPMENT VERSION (see VERSION file in this directory): <ADDITIONS TO THE CHANGE LOG SHOULD BE ADDED HERE, PLEASE> + [BA] smartd: for both SCSI and ATA now warns user if either + the number of self-test errors OR timestamp of most + recent self-test error have increased. + [DG] smartctl: output Seagate scsi Cache and Factory log pages (if available) when vendor attributes chosen diff --git a/sm5/smartd.8.in b/sm5/smartd.8.in index bb82aa93f22cfeaa307c76f8aa8bbba336716a10..964681e685c2467e749bd21648e88238e59a0501 100644 --- a/sm5/smartd.8.in +++ b/sm5/smartd.8.in @@ -1,7 +1,7 @@ .ig Copyright (C) 2002-3 Bruce Allen <smartmontools-support@lists.sourceforge.net> -$Id: smartd.8.in,v 1.14 2003/11/10 19:13:03 ballen4705 Exp $ +$Id: smartd.8.in,v 1.15 2003/11/14 07:41:39 ballen4705 Exp $ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,7 +17,7 @@ Cornwell at the Concurrent Systems Laboratory (now part of the Storage Systems Research Center), Jack Baskin School of Engineering, University of California, Santa Cruz. http://ssrc.soe.ucsc.edu/ .. -.TH SMARTD 8 "$Date: 2003/11/10 19:13:03 $" RELEASE +.TH SMARTD 8 "$Date: 2003/11/14 07:41:39 $" RELEASE .SH NAME \fBsmartd\fP \- SMART Disk Monitoring Daemon @@ -28,7 +28,7 @@ University of California, Santa Cruz. 
http://ssrc.soe.ucsc.edu/ .B /usr/sbin/smartd .SH PACKAGE VERSION -\fBRELEASE\fP "$Date: 2003/11/10 19:13:03 $" +\fBRELEASE\fP "$Date: 2003/11/14 07:41:39 $" .SH DESCRIPTION \fBsmartd\fP is a daemon that monitors the Self-Monitoring, Analysis @@ -444,12 +444,13 @@ if no Directives appear, then the device will be monitored as if the \'\-a\' Directive (monitor all SMART properties) had been given. .B If a SCSI disk is listed, -it will be monitored at the only implemented level: roughly equivalent -to using the \'\-H\' option for an ATA disk. So with the exception of -\'\-d\', \'\-m\', and \'\-M\', the Directives below are ignored for SCSI -disks. For SCSI disks, the \'\-m\' Directive sends a warning email if -the SMART status indicates a disk failure or problem, or if the SCSI -inquiry about disk status fails. +it will be monitored at the maximum implemented level: roughly +equivalent to using the \'\-H \-l selftest\' options for an ATA disk. +So with the exception of \'\-d\', \'\-m\', \'\-l selftest\', and +\'\-M\', the Directives below are ignored for SCSI disks. For SCSI +disks, the \'\-m\' Directive sends a warning email if the SMART status +indicates a disk failure or problem, or if the SCSI inquiry about disk +status fails. .B If a 3ware controller is used then the corresponding SCSI device must be listed, along with the @@ -578,22 +579,17 @@ valid arguments to this Directive are: increased since the last check. .I selftest -\- report if that the number of errors reported in the SMART Self-Test Log -has increased since the last check. Note that such errors will -.B only -be logged if you run self-tests on the disk (and it fails the tests!). -[Self-Tests can be run by using the -.B \'\-t\ short\' -and -.B \'\-t\ long\' -options of -.B smartctl -and the results of the testing can be observed using the -.B smartctl \'\-l\ selftest\' -command-line option.] 
+\- report if the number of failed tests reported in the SMART +Self-Test Log has increased since the last check, or if the timestamp +associated with the most recent failed test has increased. Note that +such errors will \fBonly\fP be logged if you run self-tests on the +disk (and it fails a test!). [Self-Tests can be run by using the +\fB\'\-t\ short\'\fP and \fB\'\-t\ long\'\fP options of \fBsmartctl\fP +and the results of the testing can be observed using the \fBsmartctl +\'\-l\ selftest\'\fP command-line option.] [Please see the -.B smartctl \-l +\fBsmartctl \-l\fP command-line option.] .TP .B \-f @@ -1355,4 +1351,4 @@ smartmontools home page at \fBhttp://smartmontools.sourceforge.net/\fP . .SH CVS ID OF THIS PAGE: -$Id: smartd.8.in,v 1.14 2003/11/10 19:13:03 ballen4705 Exp $ +$Id: smartd.8.in,v 1.15 2003/11/14 07:41:39 ballen4705 Exp $ diff --git a/sm5/smartd.c b/sm5/smartd.c index 4b57fcaf5f4210e04de632f811e09bac48e9e9c4..827e6f7f2fe01e6c46111a46550a6e3d998290c8 100644 --- a/sm5/smartd.c +++ b/sm5/smartd.c @@ -65,7 +65,7 @@ extern const char *atacmdnames_c_cvsid, *atacmds_c_cvsid, *ataprint_c_cvsid, *escalade_c_cvsid, *knowndrives_c_cvsid, *os_XXXX_c_cvsid, *scsicmds_c_cvsid, *utility_c_cvsid; -const char *smartd_c_cvsid="$Id: smartd.c,v 1.235 2003/11/13 07:43:22 dpgilbert Exp $" +const char *smartd_c_cvsid="$Id: smartd.c,v 1.236 2003/11/14 07:41:39 ballen4705 Exp $" ATACMDS_H_CVSID ATAPRINT_H_CVSID CONFIG_H_CVSID EXTERN_H_CVSID KNOWNDRIVES_H_CVSID SCSICMDS_H_CVSID SMARTD_H_CVSID UTILITY_H_CVSID; @@ -901,18 +901,24 @@ int ATADeviceScan(cfgfile *cfg){ } // capability check: self-test-log - if (cfg->selftest){ + if (cfg->selftest){ + int retval; // see if device supports Self-test logging. Note that the // following is not a typo: Device supports self-test log if and // only if it also supports error log. 
if ( !cfg->smartval || !isSmartErrorLogCapable(cfg->smartval) || - (cfg->selflogcount=SelfTestErrorCount(fd, name))<0 + (retval=SelfTestErrorCount(fd, name))<0 ) { PrintOut(LOG_INFO, "Device: %s, does not support SMART Self-test Log.\n", name); cfg->selftest=0; cfg->selflogcount=0; + cfg->selfloghour=0; + } + else { + cfg->selflogcount=SELFTEST_ERRORCOUNT(retval); + cfg->selfloghour =SELFTEST_ERRORHOURS(retval); } } @@ -1094,10 +1100,16 @@ static int SCSIDeviceScan(cfgfile *cfg) // capability check: self-test-log if (cfg->selftest){ - if ((cfg->selflogcount=scsiCountFailedSelfTests(fd, 1))<0) { + int retval=scsiCountFailedSelfTests(fd, 1); + if (retval<0) { PrintOut(LOG_INFO, "Device: %s, does not support SMART Self-test Log.\n", device); cfg->selftest=0; cfg->selflogcount=0; + cfg->selfloghour=0; + } + else { + cfg->selflogcount=SELFTEST_ERRORCOUNT(retval); + cfg->selfloghour =SELFTEST_ERRORHOURS(retval); } } @@ -1212,6 +1224,45 @@ int IsAttributeOff(unsigned char attr, unsigned char **datap, int set, int which return 0; } +// If the self-test log has got more self-test errors (or more recent +// self-test errors) recorded, then notify user. 
+void CheckSelfTestLogs(cfgfile *cfg, int new){ + char *name=cfg->name; + + if (new<0) + // command failed + PrintAndMail(cfg, 8, LOG_CRIT, "Device: %s, Read SMART Self Test Log Failed", name); + else { + // old and new error counts + int oldc=cfg->selflogcount; + int newc=SELFTEST_ERRORCOUNT(new); + + // old and new error timestamps in hours + int oldh=cfg->selfloghour; + int newh=SELFTEST_ERRORHOURS(new); + + if (oldc<newc) { + // increase in error count + PrintOut(LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d\n", + name, oldc, newc); + PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d", + name, oldc, newc); + } else if (oldh<newh) { + // more recent error + PrintOut(LOG_CRIT, "Device: %s, new Self-Test Log error at hour timestamp %d\n", + name, newh); + PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, new Self-Test Log error at hour timestamp %d\n", + name, newh); + } + + // Needed since self-test error count may DECREASE. Hour should + // never decrease but this does no harm. 
+ cfg->selflogcount= newc; + cfg->selfloghour = newh; + } + return; +} + int ATACheckDevice(cfgfile *cfg){ int fd,i; @@ -1333,39 +1384,8 @@ int ATACheckDevice(cfgfile *cfg){ } // check if number of selftest errors has increased (note: may also DECREASE) - if (cfg->selftest){ - int old=cfg->selflogcount; - int new=SelfTestErrorCount(fd, name); - - // old and new error counts - int oldc=old & 0xff; - int newc=new & 0xff; - - // old and new error timestamps in hours - int oldh=old>>8; - int newh=new>>8; - - if (new<0) - // command failed - PrintAndMail(cfg, 8, LOG_CRIT, "Device: %s, Read SMART Self Test Log Failed", name); - else if (oldc<newc) { - // increase in error count - PrintOut(LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d\n", - name, oldc, newc); - PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d", - name, oldc, newc); - } else if (oldh<newh) { - // more recent error - PrintOut(LOG_CRIT, "Device: %s, new Self-Test Log error at hour timestamp %d\n", - name, newh); - PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, new Self-Test Log error at hour timestamp %d\n", - name, newh); - } - - // Needed since self-test error count may DECREASE - if (new>=0) - cfg->selflogcount=new; - } + if (cfg->selftest) + CheckSelfTestLogs(cfg, SelfTestErrorCount(fd, name)); // check if number of ATA errors has increased if (cfg->errorlog){ @@ -1464,26 +1484,9 @@ int SCSICheckDevice(cfgfile *cfg) } // check if number of selftest errors has increased (note: may also DECREASE) - if (cfg->selftest){ - // old and new self-test error counts - int old=cfg->selflogcount; - int new=scsiCountFailedSelfTests(fd, 0); - - if (new<0) - // command failed - PrintAndMail(cfg, 8, LOG_CRIT, "Device: %s, Read SMART Self Test Log Failed", name); - else if (old<new){ - PrintOut(LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d\n", - name, old, new); - PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, Self-Test Log error 
count increased from %d to %d", - name, old, new); - } - - // Needed since self-test error count may DECREASE - if (new>=0) - cfg->selflogcount=new; - } - + if (cfg->selftest) + CheckSelfTestLogs(cfg, scsiCountFailedSelfTests(fd, 0)); + CloseDevice(fd, name); return 0; } diff --git a/sm5/smartd.conf.5.in b/sm5/smartd.conf.5.in index eff870d90556f963f57b9a1f356727a1887cfb87..d26ebd983a700340f294049811736a9e580c8f37 100644 --- a/sm5/smartd.conf.5.in +++ b/sm5/smartd.conf.5.in @@ -1,7 +1,7 @@ .ig Copyright (C) 2002-3 Bruce Allen <smartmontools-support@lists.sourceforge.net> -$Id: smartd.conf.5.in,v 1.9 2003/10/21 12:00:53 arvoreen Exp $ +$Id: smartd.conf.5.in,v 1.10 2003/11/14 07:41:40 ballen4705 Exp $ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -17,11 +17,11 @@ at the Concurrent Systems Laboratory (now part of the Storage Systems Research Center), Jack Baskin School of Engineering, University of California, Santa Cruz. http://ssrc.soe.ucsc.edu/ .. -.TH SMARTD.CONF 5 "$Date: 2003/10/21 12:00:53 $" RELEASE +.TH SMARTD.CONF 5 "$Date: 2003/11/14 07:41:40 $" RELEASE .SH NAME \fBsmartd.conf\fP \- SMART Disk Monitoring Daemon Configuration File \- \fBRELEASE\fP .SH PACKAGE VERSION -RELEASE "$Date: 2003/10/21 12:00:53 $" +RELEASE "$Date: 2003/11/14 07:41:40 $" .SH DESCRIPTION \fB/etc/smartd.conf\fP is the configuration file for the \fBsmartd\fP @@ -174,12 +174,13 @@ if no Directives appear, then the device will be monitored as if the \'\-a\' Directive (monitor all SMART properties) had been given. .B If a SCSI disk is listed, -it will be monitored at the only implemented level: roughly equivalent -to using the \'\-H\' option for an ATA disk. So with the exception of -\'\-d\', \'\-m\', and \'\-M\', the Directives below are ignored for SCSI -disks. 
For SCSI disks, the \'\-m\' Directive sends a warning email if -the SMART status indicates a disk failure or problem, or if the SCSI -inquiry about disk status fails. +it will be monitored at the maximum implemented level: roughly +equivalent to using the \'\-H \-l selftest\' options for an ATA disk. +So with the exception of \'\-d\', \'\-m\', \'\-l selftest\', and +\'\-M\', the Directives below are ignored for SCSI disks. For SCSI +disks, the \'\-m\' Directive sends a warning email if the SMART status +indicates a disk failure or problem, or if the SCSI inquiry about disk +status fails. .B If a 3ware controller is used then the corresponding SCSI device must be listed, along with the @@ -308,22 +309,17 @@ valid arguments to this Directive are: increased since the last check. .I selftest -\- report if that the number of errors reported in the SMART Self-Test Log -has increased since the last check. Note that such errors will -.B only -be logged if you run self-tests on the disk (and it fails the tests!). -[Self-Tests can be run by using the -.B \'\-t\ short\' -and -.B \'\-t\ long\' -options of -.B smartctl -and the results of the testing can be observed using the -.B smartctl \'\-l\ selftest\' -command-line option.] +\- report if the number of failed tests reported in the SMART +Self-Test Log has increased since the last check, or if the timestamp +associated with the most recent failed test has increased. Note that +such errors will \fBonly\fP be logged if you run self-tests on the +disk (and it fails a test!). [Self-Tests can be run by using the +\fB\'\-t\ short\'\fP and \fB\'\-t\ long\'\fP options of \fBsmartctl\fP +and the results of the testing can be observed using the \fBsmartctl +\'\-l\ selftest\'\fP command-line option.] [Please see the -.B smartctl \-l +\fBsmartctl \-l\fP command-line option.] 
.TP .B \-f @@ -954,4 +950,4 @@ SEE ALSO: .SH CVS ID OF THIS PAGE: -$Id: smartd.conf.5.in,v 1.9 2003/10/21 12:00:53 arvoreen Exp $ +$Id: smartd.conf.5.in,v 1.10 2003/11/14 07:41:40 ballen4705 Exp $ diff --git a/sm5/smartd.cpp b/sm5/smartd.cpp index ed41d0e4209fe0e5bb3eb9138ec98e4a1cdd7c75..a9c4eacca1b589885712aac6c3b12968af43f422 100644 --- a/sm5/smartd.cpp +++ b/sm5/smartd.cpp @@ -65,7 +65,7 @@ extern const char *atacmdnames_c_cvsid, *atacmds_c_cvsid, *ataprint_c_cvsid, *escalade_c_cvsid, *knowndrives_c_cvsid, *os_XXXX_c_cvsid, *scsicmds_c_cvsid, *utility_c_cvsid; -const char *smartd_c_cvsid="$Id: smartd.cpp,v 1.235 2003/11/13 07:43:22 dpgilbert Exp $" +const char *smartd_c_cvsid="$Id: smartd.cpp,v 1.236 2003/11/14 07:41:39 ballen4705 Exp $" ATACMDS_H_CVSID ATAPRINT_H_CVSID CONFIG_H_CVSID EXTERN_H_CVSID KNOWNDRIVES_H_CVSID SCSICMDS_H_CVSID SMARTD_H_CVSID UTILITY_H_CVSID; @@ -901,18 +901,24 @@ int ATADeviceScan(cfgfile *cfg){ } // capability check: self-test-log - if (cfg->selftest){ + if (cfg->selftest){ + int retval; // see if device supports Self-test logging. Note that the // following is not a typo: Device supports self-test log if and // only if it also supports error log. 
if ( !cfg->smartval || !isSmartErrorLogCapable(cfg->smartval) || - (cfg->selflogcount=SelfTestErrorCount(fd, name))<0 + (retval=SelfTestErrorCount(fd, name))<0 ) { PrintOut(LOG_INFO, "Device: %s, does not support SMART Self-test Log.\n", name); cfg->selftest=0; cfg->selflogcount=0; + cfg->selfloghour=0; + } + else { + cfg->selflogcount=SELFTEST_ERRORCOUNT(retval); + cfg->selfloghour =SELFTEST_ERRORHOURS(retval); } } @@ -1094,10 +1100,16 @@ static int SCSIDeviceScan(cfgfile *cfg) // capability check: self-test-log if (cfg->selftest){ - if ((cfg->selflogcount=scsiCountFailedSelfTests(fd, 1))<0) { + int retval=scsiCountFailedSelfTests(fd, 1); + if (retval<0) { PrintOut(LOG_INFO, "Device: %s, does not support SMART Self-test Log.\n", device); cfg->selftest=0; cfg->selflogcount=0; + cfg->selfloghour=0; + } + else { + cfg->selflogcount=SELFTEST_ERRORCOUNT(retval); + cfg->selfloghour =SELFTEST_ERRORHOURS(retval); } } @@ -1212,6 +1224,45 @@ int IsAttributeOff(unsigned char attr, unsigned char **datap, int set, int which return 0; } +// If the self-test log has got more self-test errors (or more recent +// self-test errors) recorded, then notify user. 
+void CheckSelfTestLogs(cfgfile *cfg, int new){ + char *name=cfg->name; + + if (new<0) + // command failed + PrintAndMail(cfg, 8, LOG_CRIT, "Device: %s, Read SMART Self Test Log Failed", name); + else { + // old and new error counts + int oldc=cfg->selflogcount; + int newc=SELFTEST_ERRORCOUNT(new); + + // old and new error timestamps in hours + int oldh=cfg->selfloghour; + int newh=SELFTEST_ERRORHOURS(new); + + if (oldc<newc) { + // increase in error count + PrintOut(LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d\n", + name, oldc, newc); + PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d", + name, oldc, newc); + } else if (oldh<newh) { + // more recent error + PrintOut(LOG_CRIT, "Device: %s, new Self-Test Log error at hour timestamp %d\n", + name, newh); + PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, new Self-Test Log error at hour timestamp %d\n", + name, newh); + } + + // Needed since self-test error count may DECREASE. Hour should + // never decrease but this does no harm. 
+ cfg->selflogcount= newc; + cfg->selfloghour = newh; + } + return; +} + int ATACheckDevice(cfgfile *cfg){ int fd,i; @@ -1333,39 +1384,8 @@ int ATACheckDevice(cfgfile *cfg){ } // check if number of selftest errors has increased (note: may also DECREASE) - if (cfg->selftest){ - int old=cfg->selflogcount; - int new=SelfTestErrorCount(fd, name); - - // old and new error counts - int oldc=old & 0xff; - int newc=new & 0xff; - - // old and new error timestamps in hours - int oldh=old>>8; - int newh=new>>8; - - if (new<0) - // command failed - PrintAndMail(cfg, 8, LOG_CRIT, "Device: %s, Read SMART Self Test Log Failed", name); - else if (oldc<newc) { - // increase in error count - PrintOut(LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d\n", - name, oldc, newc); - PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d", - name, oldc, newc); - } else if (oldh<newh) { - // more recent error - PrintOut(LOG_CRIT, "Device: %s, new Self-Test Log error at hour timestamp %d\n", - name, newh); - PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, new Self-Test Log error at hour timestamp %d\n", - name, newh); - } - - // Needed since self-test error count may DECREASE - if (new>=0) - cfg->selflogcount=new; - } + if (cfg->selftest) + CheckSelfTestLogs(cfg, SelfTestErrorCount(fd, name)); // check if number of ATA errors has increased if (cfg->errorlog){ @@ -1464,26 +1484,9 @@ int SCSICheckDevice(cfgfile *cfg) } // check if number of selftest errors has increased (note: may also DECREASE) - if (cfg->selftest){ - // old and new self-test error counts - int old=cfg->selflogcount; - int new=scsiCountFailedSelfTests(fd, 0); - - if (new<0) - // command failed - PrintAndMail(cfg, 8, LOG_CRIT, "Device: %s, Read SMART Self Test Log Failed", name); - else if (old<new){ - PrintOut(LOG_CRIT, "Device: %s, Self-Test Log error count increased from %d to %d\n", - name, old, new); - PrintAndMail(cfg, 3, LOG_CRIT, "Device: %s, Self-Test Log error 
count increased from %d to %d", - name, old, new); - } - - // Needed since self-test error count may DECREASE - if (new>=0) - cfg->selflogcount=new; - } - + if (cfg->selftest) + CheckSelfTestLogs(cfg, scsiCountFailedSelfTests(fd, 0)); + CloseDevice(fd, name); return 0; } diff --git a/sm5/smartd.h b/sm5/smartd.h index 201f2c95e5965026ffc746c011f4493c6229a13f..2b4f22e5e0dae4ab4cc08d9c1061199b800030dd 100644 --- a/sm5/smartd.h +++ b/sm5/smartd.h @@ -27,7 +27,7 @@ #ifndef SMARTD_H_CVSID -#define SMARTD_H_CVSID "$Id: smartd.h,v 1.55 2003/11/12 04:20:24 ballen4705 Exp $\n" +#define SMARTD_H_CVSID "$Id: smartd.h,v 1.56 2003/11/14 07:41:40 ballen4705 Exp $\n" #endif // Configuration file @@ -168,8 +168,8 @@ typedef struct configfile_s { char removable; // Device may disappear (not be present) char *emailcmdline; // Program for sending mail (or NULL) char *address; // Email addresses (or NULL) - int selflogcount; // bits 0:7 total number of self-test errors - // bits 8:XX lifetime hours of last self-test error + unsigned char selflogcount; // total number of self-test errors + unsigned short selfloghour; // lifetime hours of last self-test error // THE NEXT SET OF ENTRIES TRACK DEVICE STATE AND ARE DYNAMIC mailinfo maildata[10]; // Tracks type/date of email messages sent @@ -248,3 +248,6 @@ export NJAMD_TRACE_LIBS=1 #endif #endif + +#define SELFTEST_ERRORCOUNT(x) (x & 0xff) +#define SELFTEST_ERRORHOURS(x) ((x >> 8) & 0xffff)