diff --git a/sm5/CHANGELOG b/sm5/CHANGELOG index 7ff2960d2f520352a7417fc1346f55dbfe77aa79..4d762bb88705d8250905d65018478a29a012214a 100644 --- a/sm5/CHANGELOG +++ b/sm5/CHANGELOG @@ -1,6 +1,6 @@ CHANGELOG for smartmontools -$Id: CHANGELOG,v 1.394 2004/04/07 20:55:32 chrfranke Exp $ +$Id: CHANGELOG,v 1.395 2004/04/09 00:28:43 ballen4705 Exp $ The most recent version of this file is: http://cvs.sourceforge.net/viewcvs.py/smartmontools/sm5/CHANGELOG?sortby=date&view=markup @@ -27,6 +27,12 @@ NOTES FOR FUTURE RELEASES: see TODO file. <ADDITIONS TO THE CHANGE LOG SHOULD BE ADDED JUST BELOW HERE, PLEASE> + [BA] smartd: now monitor the Current Pending Sector count (Attribute 197) + and the Offline Uncorrectable Sector Count (Attribute 198). Log a + warning (and send an email, if so configured) if the raw count + is nonzero. These are controlled by new Directives: -C and -U. + Currently they are enabled by default. + [CF] Added option -c FILE, --configfile=FILE to smartd to specify an alternate configuration FILE or '-' for standard input. diff --git a/sm5/atacmds.c b/sm5/atacmds.c index 07fa1da3f50dec36d4fdd91d1f26fb72455fa116..85493a35a4b8abd868e6ea591222e174ff9ec139 100644 --- a/sm5/atacmds.c +++ b/sm5/atacmds.c @@ -35,7 +35,7 @@ #include "extern.h" #include "utility.h" -const char *atacmds_c_cvsid="$Id: atacmds.c,v 1.146 2004/03/26 14:22:08 ballen4705 Exp $" +const char *atacmds_c_cvsid="$Id: atacmds.c,v 1.147 2004/04/09 00:28:43 ballen4705 Exp $" ATACMDS_H_CVSID CONFIG_H_CVSID EXTERN_H_CVSID INT64_H_CVSID UTILITY_H_CVSID; // to hold onto exit code for atexit routine @@ -1786,3 +1786,41 @@ void ataPrintSmartAttribName(char *out, unsigned char id, unsigned char *definit sprintf(out,"%3hu %s",(short int)id,name); return; } + +// Returns raw value of Attribute with ID==id. This will be in the +// range 0 to 2^48-1 inclusive. If the Attribute does not exist, +// return -1.
+int64_t ATAReturnAttributeRawValue(unsigned char id, struct ata_smart_values *data) { + int i; + + // valid Attribute IDs are in the range 1 to 255 inclusive. + if (!id || !data) + return -1; + + // loop over Attributes to see if there is one with the desired ID + for (i=0; i<NUMBER_ATA_SMART_ATTRIBUTES; i++) { + struct ata_smart_attribute *this = data->vendor_attributes + i; + if (this->id == id) { + // we've found the desired Attribute. Return its value + int64_t rawvalue=0; + int j; + + for (j=0; j<6; j++) { + // This looks a bit roundabout, but is necessary. Don't + // succumb to the temptation to use raw[j]<<(8*j) since under + // the normal rules this will be promoted to the native type. + // On a 32 bit machine this might then overflow. + int64_t temp; + temp = this->raw[j]; + temp <<= 8*j; + rawvalue |= temp; + } // loop over j + return rawvalue; + } // found desired Attribute + } // loop over Attributes + + // fall-through: no such Attribute found + return -1; +} + + diff --git a/sm5/atacmds.cpp b/sm5/atacmds.cpp index 768aa95ed4e90d67f4d413a88f40cfbb8a4d6a55..ac6aea8f7b2d1e5e140870fecd1b281fb57505bc 100644 --- a/sm5/atacmds.cpp +++ b/sm5/atacmds.cpp @@ -35,7 +35,7 @@ #include "extern.h" #include "utility.h" -const char *atacmds_c_cvsid="$Id: atacmds.cpp,v 1.146 2004/03/26 14:22:08 ballen4705 Exp $" +const char *atacmds_c_cvsid="$Id: atacmds.cpp,v 1.147 2004/04/09 00:28:43 ballen4705 Exp $" ATACMDS_H_CVSID CONFIG_H_CVSID EXTERN_H_CVSID INT64_H_CVSID UTILITY_H_CVSID; // to hold onto exit code for atexit routine @@ -1786,3 +1786,41 @@ void ataPrintSmartAttribName(char *out, unsigned char id, unsigned char *definit sprintf(out,"%3hu %s",(short int)id,name); return; } + +// Returns raw value of Attribute with ID==id. This will be in the +// range 0 to 2^48-1 inclusive. If the Attribute does not exist, +// return -1. 
+int64_t ATAReturnAttributeRawValue(unsigned char id, struct ata_smart_values *data) { + int i; + + // valid Attribute IDs are in the range 1 to 255 inclusive. + if (!id || !data) + return -1; + + // loop over Attributes to see if there is one with the desired ID + for (i=0; i<NUMBER_ATA_SMART_ATTRIBUTES; i++) { + struct ata_smart_attribute *this = data->vendor_attributes + i; + if (this->id == id) { + // we've found the desired Attribute. Return its value + int64_t rawvalue=0; + int j; + + for (j=0; j<6; j++) { + // This looks a bit roundabout, but is necessary. Don't + // succumb to the temptation to use raw[j]<<(8*j) since under + // the normal rules this will be promoted to the native type. + // On a 32 bit machine this might then overflow. + int64_t temp; + temp = this->raw[j]; + temp <<= 8*j; + rawvalue |= temp; + } // loop over j + return rawvalue; + } // found desired Attribute + } // loop over Attributes + + // fall-through: no such Attribute found + return -1; +} + + diff --git a/sm5/atacmds.h b/sm5/atacmds.h index 8506708c2870814185b810a3f70faa06e899888f..94b76fbc75bf1b0bf03481800ff3fd31b541908f 100644 --- a/sm5/atacmds.h +++ b/sm5/atacmds.h @@ -25,7 +25,7 @@ #ifndef ATACMDS_H_ #define ATACMDS_H_ -#define ATACMDS_H_CVSID "$Id: atacmds.h,v 1.67 2004/03/26 14:22:08 ballen4705 Exp $\n" +#define ATACMDS_H_CVSID "$Id: atacmds.h,v 1.68 2004/04/09 00:28:44 ballen4705 Exp $\n" #include "int64.h" @@ -447,6 +447,12 @@ int ataCheckAttribute(struct ata_smart_values *data, // Structure with the incorrect checksum. void checksumwarning(const char *string); +// Returns raw value of Attribute with ID==id. This will be in the +// range 0 to 2^48-1 inclusive. If the Attribute does not exist, +// return -1. 
+int64_t ATAReturnAttributeRawValue(unsigned char id, struct ata_smart_values *data); + + #define MAX_ATTRIBUTE_NUM 256 extern const char *vendorattributeargs[]; diff --git a/sm5/smartd.8.in b/sm5/smartd.8.in index e9c86ffac749be5e41ebf58183897bbafe4cd66d..7ec35c9668746ec3367058a343ee2636067af486 100644 --- a/sm5/smartd.8.in +++ b/sm5/smartd.8.in @@ -1,7 +1,7 @@ .ig Copyright (C) 2002-4 Bruce Allen <smartmontools-support@lists.sourceforge.net> -$Id: smartd.8.in,v 1.55 2004/04/07 22:06:13 ballen4705 Exp $ +$Id: smartd.8.in,v 1.56 2004/04/09 00:28:44 ballen4705 Exp $ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -793,11 +793,10 @@ usually the desired behavior. .TP .B \-m ADD -Send a warning email to the email address -.B ADD -if the \'\-H\', \'\-l\', or \'\-f\' Directives detect a failure or a new -error, or if a SMART command to the disk fails. This Directive only -works in conjunction with these other Directives (or with the +Send a warning email to the email address \fBADD\fP if the \'\-H\', +\'\-l\', \'\-f\', \'\-C\', or \'\-U\' Directives detect a failure or a +new error, or if a SMART command to the disk fails. This Directive +only works in conjunction with these other Directives (or with the equivalent default \'\-a\' Directive). To prevent your email in-box from getting filled up with warning @@ -1109,12 +1108,46 @@ A common use of this Directive is to track the device Temperature different types of system behavior affects the values of certain Attributes. +.TP +.B \-C ID +Report if the current number of pending sectors is non-zero. Here +\fBID\fP is the id number of the Attribute whose raw value is the +Current Pending Sector count. The default value of \fBID\fP is 197. +To turn off this reporting, use ID\ =\ 0. The allowed range of ID is +0 to 255 inclusive.
+ +A pending sector is a disk sector (containing 512 bytes of your data) +which the device would like to mark as ``bad'' and reallocate. +Typically this is because your computer tried to read that sector, and +the read failed because the data on it has been corrupted and has +inconsistent Error Checking and Correction (ECC) codes. This is +important to know, because it means that there is some unreadable data +on the disk. The problem of figuring out what file this data belongs +to is operating system and file system specific. You can typically +force the sector to reallocate by writing to it (translation: make the +device substitute a spare good sector for the bad one) but at the +price of losing the 512 bytes of data stored there. + +.TP +.B \-U ID +[ATA only] Report if the number of offline uncorrectable sectors is +non-zero. Here \fBID\fP is the id number of the Attribute whose raw +value is the Offline Uncorrectable sector count. The default value of +\fBID\fP is 198. To turn off this reporting, use ID\ =\ 0. The +allowed range of ID is 0 to 255 inclusive. + +An offline uncorrectable sector is a disk sector which was not +readable during an off\-line scan or a self\-test. This is important +to know, because if you have data stored in this disk sector, and you +need to read it, the read will fail. Please see the previous \'\-C\' +option for more details. + .TP .B \-F TYPE -Modifies the behavior of \fBsmartd\fP to compensate for some known and -understood device firmware bug. The arguments to this Directive are -exclusive, so that only the final Directive given is used. The valid -values are: +[ATA only] Modifies the behavior of \fBsmartd\fP to compensate for +some known and understood device firmware bug. The arguments to this +Directive are exclusive, so that only the final Directive given is +used. The valid values are: .I none \- Assume that the device firmware obeys the ATA specifications.
This is @@ -1604,4 +1637,4 @@ smartmontools home page at \fBhttp://smartmontools.sourceforge.net/\fP . .SH CVS ID OF THIS PAGE: -$Id: smartd.8.in,v 1.55 2004/04/07 22:06:13 ballen4705 Exp $ +$Id: smartd.8.in,v 1.56 2004/04/09 00:28:44 ballen4705 Exp $ diff --git a/sm5/smartd.c b/sm5/smartd.c index 1bbaa8de563b38694194e63425abf525792b86d1..e04041aacd82eba565968b4d10d2860a0fe35997 100644 --- a/sm5/smartd.c +++ b/sm5/smartd.c @@ -98,7 +98,7 @@ int getdomainname(char *, int); /* no declaration in header files! */ extern const char *atacmdnames_c_cvsid, *atacmds_c_cvsid, *ataprint_c_cvsid, *escalade_c_cvsid, *knowndrives_c_cvsid, *os_XXXX_c_cvsid, *scsicmds_c_cvsid, *utility_c_cvsid; -static const char *filenameandversion="$Id: smartd.c,v 1.308 2004/04/07 20:55:31 chrfranke Exp $"; +static const char *filenameandversion="$Id: smartd.c,v 1.309 2004/04/09 00:28:44 ballen4705 Exp $"; #ifdef NEED_SOLARIS_ATA_CODE extern const char *os_solaris_ata_s_cvsid; #endif @@ -109,7 +109,7 @@ extern const char *syslog_win32_c_cvsid; extern const char *int64_vc6_c_cvsid; #endif #endif -const char *smartd_c_cvsid="$Id: smartd.c,v 1.308 2004/04/07 20:55:31 chrfranke Exp $" +const char *smartd_c_cvsid="$Id: smartd.c,v 1.309 2004/04/09 00:28:44 ballen4705 Exp $" ATACMDS_H_CVSID ATAPRINT_H_CVSID CONFIG_H_CVSID EXTERN_H_CVSID INT64_H_CVSID KNOWNDRIVES_H_CVSID SCSICMDS_H_CVSID SMARTD_H_CVSID #ifdef SYSLOG_H_CVSID @@ -190,6 +190,33 @@ volatile int caughtsigEXIT=0; jmp_buf registerscsienv; #endif +// tranlate cfg->pending into the correct Attribute numbers +void TranslatePending(unsigned short pending, unsigned char *current, unsigned char *offline) { + + unsigned char curr = CURR_PEND(pending); + unsigned char off = OFF_PEND(pending); + + // look for special value of CUR_UNC_DEFAULT that means DONT + // monitor. 0 means DO test. + if (curr==CUR_UNC_DEFAULT) + curr=0; + else if (curr==0) + curr=CUR_UNC_DEFAULT; + + // look for special value of OFF_UNC_DEFAULT that means DONT + // monitor. 
0 means DO TEST. + if (off==OFF_UNC_DEFAULT) + off=0; + else if (off==0) + off=OFF_UNC_DEFAULT; + + *current=curr; + *offline=off; + + return; +} + + // free all memory associated with selftest part of configfile entry. Return NULL testinfo* FreeTestData(testinfo *data){ @@ -486,7 +513,9 @@ void MailWarning(cfgfile *cfg, int which, char *fmt, ...){ "FAILEDreadsmartdata", // 6 "FAILEDreadsmarterrorlog", // 7 "FAILEDreadsmartsefltestlog", // 8 - "FAILEDopendevice" // 9 + "FAILEDopendevice", // 9 + "CurrentPendingSector", // 10 + "OfflinePendingSector" // 11 }; char *address, *executable; @@ -961,6 +990,8 @@ void Directives() { " -P TYPE Drive-specific presets: use, ignore, show, showall\n" " -a Default: equivalent to -H -f -t -l error -l selftest\n" " -F TYPE Firmware bug workaround: none, samsung, samsung2\n" + " -C ID Monitor current pending sectors in Attribute ID\n" + " -U ID Monitor offline uncorrectable sectors in Attribute ID\n" " # Comment: text after a hash sign is ignored\n" " \\ Line continuation character\n" "Attribute ID is a decimal integer 1 <= ID <= 255\n" @@ -1228,7 +1259,10 @@ int ATADeviceScan(cfgfile *cfg){ retainsmartdata=cfg->usagefailed || cfg->prefail || cfg->usage; // do we need to get SMART data? 
- if (retainsmartdata || cfg->autoofflinetest || cfg->selftest || cfg->errorlog) { + if (retainsmartdata || cfg->autoofflinetest || cfg->selftest || cfg->errorlog || cfg->pending!=DONT_MONITOR_UNC) { + + unsigned char currentpending, offlinepending; + cfg->smartval=(struct ata_smart_values *)Calloc(1,sizeof(struct ata_smart_values)); cfg->smartthres=(struct ata_smart_thresholds_pvt *)Calloc(1,sizeof(struct ata_smart_thresholds_pvt)); @@ -1241,6 +1275,25 @@ int ATADeviceScan(cfgfile *cfg){ ataReadSmartThresholds (fd,cfg->smartthres)){ PrintOut(LOG_INFO,"Device: %s, Read SMART Values and/or Thresholds Failed\n",name); retainsmartdata=cfg->usagefailed=cfg->prefail=cfg->usage=0; + cfg->pending=DONT_MONITOR_UNC; + } + + // see if the necessary Attribute is there to monitor offline or + // current pending sectors + TranslatePending(cfg->pending, &currentpending, &offlinepending); + + if (currentpending && ATAReturnAttributeRawValue(currentpending, cfg->smartval)<0) { + PrintOut(LOG_INFO,"Device: %s, can't monitor Current Pending Sector count - no Attribute %d\n", + name, (int)currentpending); + cfg->pending &= 0xff00; + cfg->pending |= CUR_UNC_DEFAULT; + } + + if (offlinepending && ATAReturnAttributeRawValue(offlinepending, cfg->smartval)<0) { + PrintOut(LOG_INFO,"Device: %s, can't monitor Offline Uncorrectable Sector count - no Attribute %d\n", + name, (int)offlinepending); + cfg->pending &= 0x00ff; + cfg->pending |= OFF_UNC_DEFAULT<<8; } } @@ -1910,7 +1963,7 @@ int ATACheckDevice(cfgfile *cfg){ } // Check everything that depends upon SMART Data (eg, Attribute values) - if (cfg->usagefailed || cfg->prefail || cfg->usage){ + if (cfg->usagefailed || cfg->prefail || cfg->usage || cfg->pending!=DONT_MONITOR_UNC){ struct ata_smart_values curval; struct ata_smart_thresholds_pvt *thresh=cfg->smartthres; @@ -1919,79 +1972,102 @@ int ATACheckDevice(cfgfile *cfg){ PrintOut(LOG_CRIT, "Device: %s, failed to read SMART Attribute Data\n", name); MailWarning(cfg, 6, "Device: %s, failed to
read SMART Attribute Data", name); } - else { - // look for failed usage attributes, or track usage or prefail attributes - for (i=0; i<NUMBER_ATA_SMART_ATTRIBUTES; i++){ - int att; - changedattribute_t delta; - - // This block looks for usage attributes that have failed. - // Prefail attributes that have failed are returned with a - // positive sign. No failure returns 0. Usage attributes<0. - if (cfg->usagefailed && ((att=ataCheckAttribute(&curval, thresh, i))<0)){ - - // are we ignoring failures of this attribute? - att *= -1; - if (!IsAttributeOff(att, &cfg->monitorattflags, 0, MONITOR_FAILUSE, __LINE__)){ - char attname[64], *loc=attname; - - // get attribute name & skip white space - ataPrintSmartAttribName(loc, att, cfg->attributedefs); - while (*loc && *loc==' ') loc++; - - // warning message - PrintOut(LOG_CRIT, "Device: %s, Failed SMART usage Attribute: %s.\n", name, loc); - MailWarning(cfg, 2, "Device: %s, Failed SMART usage Attribute: %s.", name, loc); - } - } - - // This block tracks usage or prefailure attributes to see if - // they are changing. It also looks for changes in RAW values - // if this has been requested by user. - if ((cfg->usage || cfg->prefail) && ATACompareValues(&delta, &curval, cfg->smartval, thresh, i, name)){ - unsigned char id=delta.id; - - // if the only change is the raw value, and we're not - // tracking raw value, then continue loop over attributes - if (!delta.sameraw && delta.newval==delta.oldval && !IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_RAW, __LINE__)) - continue; - - // are we tracking this attribute? - if (!IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_IGNORE, __LINE__)){ - char newrawstring[64], oldrawstring[64], attname[64], *loc=attname; - - // get attribute name, skip spaces - ataPrintSmartAttribName(loc, id, cfg->attributedefs); - while (*loc && *loc==' ') loc++; - - // has the user asked for us to print raw values? 
- if (IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_RAWPRINT, __LINE__)) { - // get raw values (as a string) and add to printout - char rawstring[64]; - ataPrintSmartAttribRawValue(rawstring, curval.vendor_attributes+i, cfg->attributedefs); - sprintf(newrawstring, " [Raw %s]", rawstring); - ataPrintSmartAttribRawValue(rawstring, cfg->smartval->vendor_attributes+i, cfg->attributedefs); - sprintf(oldrawstring, " [Raw %s]", rawstring); - } - else - newrawstring[0]=oldrawstring[0]='\0'; - - // prefailure attribute - if (cfg->prefail && delta.prefail) - PrintOut(LOG_INFO, "Device: %s, SMART Prefailure Attribute: %s changed from %d%s to %d%s\n", - name, loc, delta.oldval, oldrawstring, delta.newval, newrawstring); - - // usage attribute - if (cfg->usage && !delta.prefail) - PrintOut(LOG_INFO, "Device: %s, SMART Usage Attribute: %s changed from %d%s to %d%s\n", - name, loc, delta.oldval, oldrawstring, delta.newval, newrawstring); - } - } // endof block tracking usage or prefailure - } // end of loop over attributes - - // Save the new values into *drive for the next time around - *(cfg->smartval)=curval; - } + else { + // look for current or offline pending sectors + if (cfg->pending != DONT_MONITOR_UNC) { + int64_t rawval; + unsigned char currentpending, offlinepending; + + TranslatePending(cfg->pending, &currentpending, &offlinepending); + + if (currentpending && (rawval=ATAReturnAttributeRawValue(currentpending, &curval))>0) { + // Unreadable pending sectors!! + PrintOut(LOG_CRIT, "Device: %s, %"PRIu64" Currently unreadable (pending) sectors\n", name, rawval); + MailWarning(cfg, 10, "Device: %s, %"PRIu64" Currently unreadable (pending) sectors", name, rawval); + } + + if (offlinepending && (rawval=ATAReturnAttributeRawValue(offlinepending, &curval))>0) { + // Unreadable offline sectors!!
+ PrintOut(LOG_CRIT, "Device: %s, %"PRIu64" Offline uncorrectable sectors\n", name, rawval); + MailWarning(cfg, 11, "Device: %s, %"PRIu64" Offline uncorrectable sectors", name, rawval); + } + } + + if (cfg->usagefailed || cfg->prefail || cfg->usage) { + + // look for failed usage attributes, or track usage or prefail attributes + for (i=0; i<NUMBER_ATA_SMART_ATTRIBUTES; i++){ + int att; + changedattribute_t delta; + + // This block looks for usage attributes that have failed. + // Prefail attributes that have failed are returned with a + // positive sign. No failure returns 0. Usage attributes<0. + if (cfg->usagefailed && ((att=ataCheckAttribute(&curval, thresh, i))<0)){ + + // are we ignoring failures of this attribute? + att *= -1; + if (!IsAttributeOff(att, &cfg->monitorattflags, 0, MONITOR_FAILUSE, __LINE__)){ + char attname[64], *loc=attname; + + // get attribute name & skip white space + ataPrintSmartAttribName(loc, att, cfg->attributedefs); + while (*loc && *loc==' ') loc++; + + // warning message + PrintOut(LOG_CRIT, "Device: %s, Failed SMART usage Attribute: %s.\n", name, loc); + MailWarning(cfg, 2, "Device: %s, Failed SMART usage Attribute: %s.", name, loc); + } + } + + // This block tracks usage or prefailure attributes to see if + // they are changing. It also looks for changes in RAW values + // if this has been requested by user. + if ((cfg->usage || cfg->prefail) && ATACompareValues(&delta, &curval, cfg->smartval, thresh, i, name)){ + unsigned char id=delta.id; + + // if the only change is the raw value, and we're not + // tracking raw value, then continue loop over attributes + if (!delta.sameraw && delta.newval==delta.oldval && !IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_RAW, __LINE__)) + continue; + + // are we tracking this attribute? 
+ if (!IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_IGNORE, __LINE__)){ + char newrawstring[64], oldrawstring[64], attname[64], *loc=attname; + + // get attribute name, skip spaces + ataPrintSmartAttribName(loc, id, cfg->attributedefs); + while (*loc && *loc==' ') loc++; + + // has the user asked for us to print raw values? + if (IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_RAWPRINT, __LINE__)) { + // get raw values (as a string) and add to printout + char rawstring[64]; + ataPrintSmartAttribRawValue(rawstring, curval.vendor_attributes+i, cfg->attributedefs); + sprintf(newrawstring, " [Raw %s]", rawstring); + ataPrintSmartAttribRawValue(rawstring, cfg->smartval->vendor_attributes+i, cfg->attributedefs); + sprintf(oldrawstring, " [Raw %s]", rawstring); + } + else + newrawstring[0]=oldrawstring[0]='\0'; + + // prefailure attribute + if (cfg->prefail && delta.prefail) + PrintOut(LOG_INFO, "Device: %s, SMART Prefailure Attribute: %s changed from %d%s to %d%s\n", + name, loc, delta.oldval, oldrawstring, delta.newval, newrawstring); + + // usage attribute + if (cfg->usage && !delta.prefail) + PrintOut(LOG_INFO, "Device: %s, SMART Usage Attribute: %s changed from %d%s to %d%s\n", + name, loc, delta.oldval, oldrawstring, delta.newval, newrawstring); + } + } // endof block tracking usage or prefailure + } // end of loop over attributes + + // Save the new values into *drive for the next time around + *(cfg->smartval)=curval; + } + } } // check if number of selftest errors has increased (note: may also DECREASE) @@ -2008,8 +2084,9 @@ int ATACheckDevice(cfgfile *cfg){ // did command fail? if (new<0) + // lack of PrintOut here is INTENTIONAL MailWarning(cfg, 7, "Device: %s, Read SMART Error Log Failed", name); - + // has error count increased? 
if (new>old){ PrintOut(LOG_CRIT, "Device: %s, ATA error count increased from %d to %d\n", @@ -2065,8 +2142,9 @@ int SCSICheckDevice(cfgfile *cfg) // if we can't open device, fail gracefully rather than hard -- // perhaps the next time around we'll be able to open it if ((fd=OpenDevice(name, "SCSI"))<0) { - MailWarning(cfg, 9, "Device: %s, unable to open device", name); - return 1; + // Lack of PrintOut() here is intentional! + MailWarning(cfg, 9, "Device: %s, unable to open device", name); + return 1; } currenttemp = 0; asc = 0; @@ -2378,6 +2456,30 @@ int ParseToken(char *token,cfgfile *cfg){ switch (sym) { int val; + case 'C': + // monitor current pending sector count (default 197) + if ((val=GetInteger(arg=strtok(NULL,delim), name, token, lineno, configfile, 0, 255))<0) + return -1; + if (val==CUR_UNC_DEFAULT) + val=0; + else if (val==0) + val=CUR_UNC_DEFAULT; + // set bottom 8 bits to correct value + cfg->pending &= 0xff00; + cfg->pending |= val; + break; + case 'U': + // monitor offline uncorrectable sectors (default 198) + if ((val=GetInteger(arg=strtok(NULL,delim), name, token, lineno, configfile, 0, 255))<0) + return -1; + if (val==OFF_UNC_DEFAULT) + val=0; + else if (val==0) + val=OFF_UNC_DEFAULT; + // turn off top 8 bits, then set to correct value + cfg->pending &= 0xff; + cfg->pending |= (val<<8); + break; case 'T': // Set tolerance level for SMART command failures if ((arg = strtok(NULL, delim)) == NULL) { diff --git a/sm5/smartd.conf.5.in b/sm5/smartd.conf.5.in index 8b6929457b6b64ead5c4fbf01feefe0d460f836e..57033d24c0cb6653525a432c37ddf2eac46c18e4 100644 --- a/sm5/smartd.conf.5.in +++ b/sm5/smartd.conf.5.in @@ -1,7 +1,7 @@ .ig Copyright (C) 2002-4 Bruce Allen <smartmontools-support@lists.sourceforge.net> -$Id: smartd.conf.5.in,v 1.42 2004/04/07 22:06:13 ballen4705 Exp $ +$Id: smartd.conf.5.in,v 1.43 2004/04/09 00:28:44 ballen4705 Exp $ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public 
License as published by the Free @@ -465,11 +465,10 @@ usually the desired behavior. .TP .B \-m ADD -Send a warning email to the email address -.B ADD -if the \'\-H\', \'\-l\', or \'\-f\' Directives detect a failure or a new -error, or if a SMART command to the disk fails. This Directive only -works in conjunction with these other Directives (or with the +Send a warning email to the email address \fBADD\fP if the \'\-H\', +\'\-l\', \'\-f\', \'\-C\', or \'\-U\' Directives detect a failure or a +new error, or if a SMART command to the disk fails. This Directive +only works in conjunction with these other Directives (or with the equivalent default \'\-a\' Directive). To prevent your email in-box from getting filled up with warning @@ -781,12 +780,46 @@ A common use of this Directive is to track the device Temperature different types of system behavior affects the values of certain Attributes. +.TP +.B \-C ID +Report if the current number of pending sectors is non-zero. Here +\fBID\fP is the id number of the Attribute whose raw value is the +Current Pending Sector count. The default value of \fBID\fP is 197. +To turn off this reporting, use ID\ =\ 0. The allowed range of ID is +0 to 255 inclusive. + +A pending sector is a disk sector (containing 512 bytes of your data) +which the device would like to mark as ``bad'' and reallocate. +Typically this is because your computer tried to read that sector, and +the read failed because the data on it has been corrupted and has +inconsistent Error Checking and Correction (ECC) codes. This is +important to know, because it means that there is some unreadable data +on the disk. The problem of figuring out what file this data belongs +to is operating system and file system specific. You can typically +force the sector to reallocate by writing to it (translation: make the +device substitute a spare good sector for the bad one) but at the +price of losing the 512 bytes of data stored there.
+ +.TP +.B \-U ID +[ATA only] Report if the number of offline uncorrectable sectors is +non-zero. Here \fBID\fP is the id number of the Attribute whose raw +value is the Offline Uncorrectable sector count. The default value of +\fBID\fP is 198. To turn off this reporting, use ID\ =\ 0. The +allowed range of ID is 0 to 255 inclusive. + +An offline uncorrectable sector is a disk sector which was not +readable during an off\-line scan or a self\-test. This is important +to know, because if you have data stored in this disk sector, and you +need to read it, the read will fail. Please see the previous \'\-C\' +option for more details. + .TP .B \-F TYPE -Modifies the behavior of \fBsmartd\fP to compensate for some known and -understood device firmware bug. The arguments to this Directive are -exclusive, so that only the final Directive given is used. The valid -values are: +[ATA only] Modifies the behavior of \fBsmartd\fP to compensate for +some known and understood device firmware bug. The arguments to this +Directive are exclusive, so that only the final Directive given is +used. The valid values are: .I none \- Assume that the device firmware obeys the ATA specifications. This is @@ -1112,4 +1145,4 @@ SEE ALSO: .SH CVS ID OF THIS PAGE: -$Id: smartd.conf.5.in,v 1.42 2004/04/07 22:06:13 ballen4705 Exp $ +$Id: smartd.conf.5.in,v 1.43 2004/04/09 00:28:44 ballen4705 Exp $ diff --git a/sm5/smartd.cpp b/sm5/smartd.cpp index a28f54489ccf6f854cabc1b0f5eab72c06dee965..1edeb27b832da28b0b45db3da053696939f59512 100644 --- a/sm5/smartd.cpp +++ b/sm5/smartd.cpp @@ -98,7 +98,7 @@ int getdomainname(char *, int); /* no declaration in header files! 
*/ extern const char *atacmdnames_c_cvsid, *atacmds_c_cvsid, *ataprint_c_cvsid, *escalade_c_cvsid, *knowndrives_c_cvsid, *os_XXXX_c_cvsid, *scsicmds_c_cvsid, *utility_c_cvsid; -static const char *filenameandversion="$Id: smartd.cpp,v 1.308 2004/04/07 20:55:31 chrfranke Exp $"; +static const char *filenameandversion="$Id: smartd.cpp,v 1.309 2004/04/09 00:28:44 ballen4705 Exp $"; #ifdef NEED_SOLARIS_ATA_CODE extern const char *os_solaris_ata_s_cvsid; #endif @@ -109,7 +109,7 @@ extern const char *syslog_win32_c_cvsid; extern const char *int64_vc6_c_cvsid; #endif #endif -const char *smartd_c_cvsid="$Id: smartd.cpp,v 1.308 2004/04/07 20:55:31 chrfranke Exp $" +const char *smartd_c_cvsid="$Id: smartd.cpp,v 1.309 2004/04/09 00:28:44 ballen4705 Exp $" ATACMDS_H_CVSID ATAPRINT_H_CVSID CONFIG_H_CVSID EXTERN_H_CVSID INT64_H_CVSID KNOWNDRIVES_H_CVSID SCSICMDS_H_CVSID SMARTD_H_CVSID #ifdef SYSLOG_H_CVSID @@ -190,6 +190,33 @@ volatile int caughtsigEXIT=0; jmp_buf registerscsienv; #endif +// tranlate cfg->pending into the correct Attribute numbers +void TranslatePending(unsigned short pending, unsigned char *current, unsigned char *offline) { + + unsigned char curr = CURR_PEND(pending); + unsigned char off = OFF_PEND(pending); + + // look for special value of CUR_UNC_DEFAULT that means DONT + // monitor. 0 means DO test. + if (curr==CUR_UNC_DEFAULT) + curr=0; + else if (curr==0) + curr=CUR_UNC_DEFAULT; + + // look for special value of OFF_UNC_DEFAULT that means DONT + // monitor. 0 means DO TEST. + if (off==OFF_UNC_DEFAULT) + off=0; + else if (off==0) + off=OFF_UNC_DEFAULT; + + *current=curr; + *offline=off; + + return; +} + + // free all memory associated with selftest part of configfile entry. 
Return NULL testinfo* FreeTestData(testinfo *data){ @@ -486,7 +513,9 @@ void MailWarning(cfgfile *cfg, int which, char *fmt, ...){ "FAILEDreadsmartdata", // 6 "FAILEDreadsmarterrorlog", // 7 "FAILEDreadsmartsefltestlog", // 8 - "FAILEDopendevice" // 9 + "FAILEDopendevice", // 9 + "CurrentPendingSector", // 10 + "OfflinePendingSector" // 11 }; char *address, *executable; @@ -961,6 +990,8 @@ void Directives() { " -P TYPE Drive-specific presets: use, ignore, show, showall\n" " -a Default: equivalent to -H -f -t -l error -l selftest\n" " -F TYPE Firmware bug workaround: none, samsung, samsung2\n" + " -C ID Monitor current pending sectors in Attribute ID\n" + " -U ID Monitor offline uncorrectable sectors in Attribute ID\n" " # Comment: text after a hash sign is ignored\n" " \\ Line continuation character\n" "Attribute ID is a decimal integer 1 <= ID <= 255\n" @@ -1228,7 +1259,10 @@ int ATADeviceScan(cfgfile *cfg){ retainsmartdata=cfg->usagefailed || cfg->prefail || cfg->usage; // do we need to get SMART data? 
- if (retainsmartdata || cfg->autoofflinetest || cfg->selftest || cfg->errorlog) { + if (retainsmartdata || cfg->autoofflinetest || cfg->selftest || cfg->errorlog || cfg->pending!=DONT_MONITOR_UNC) { + + unsigned char currentpending, offlinepending; + cfg->smartval=(struct ata_smart_values *)Calloc(1,sizeof(struct ata_smart_values)); cfg->smartthres=(struct ata_smart_thresholds_pvt *)Calloc(1,sizeof(struct ata_smart_thresholds_pvt)); @@ -1241,6 +1275,25 @@ int ATADeviceScan(cfgfile *cfg){ ataReadSmartThresholds (fd,cfg->smartthres)){ PrintOut(LOG_INFO,"Device: %s, Read SMART Values and/or Thresholds Failed\n",name); retainsmartdata=cfg->usagefailed=cfg->prefail=cfg->usage=0; + cfg->pending=DONT_MONITOR_UNC; + } + + // see if the necessary Attribute is there to monitor offline or + // current pending sectors + TranslatePending(cfg->pending, &currentpending, &offlinepending); + + if (currentpending && ATAReturnAttributeRawValue(currentpending, cfg->smartval)<0) { + PrintOut(LOG_INFO,"Device: %s, can't monitor Current Pending Sector count - no Attribute %d\n", + name, (int)currentpending); + cfg->pending &= 0xff00; + cfg->pending |= CUR_UNC_DEFAULT; + } + + if (offlinepending && ATAReturnAttributeRawValue(offlinepending, cfg->smartval)<0) { + PrintOut(LOG_INFO,"Device: %s, can't monitor Offline Uncorrectable Sector count - no Attribute %d\n", + name, (int)offlinepending); + cfg->pending &= 0x00ff; + cfg->pending |= OFF_UNC_DEFAULT<<8; } } @@ -1910,7 +1963,7 @@ int ATACheckDevice(cfgfile *cfg){ } // Check everything that depends upon SMART Data (eg, Attribute values) - if (cfg->usagefailed || cfg->prefail || cfg->usage){ + if (cfg->usagefailed || cfg->prefail || cfg->usage || cfg->pending!=DONT_MONITOR_UNC){ struct ata_smart_values curval; struct ata_smart_thresholds_pvt *thresh=cfg->smartthres; @@ -1919,79 +1972,102 @@ int ATACheckDevice(cfgfile *cfg){ PrintOut(LOG_CRIT, "Device: %s, failed to read SMART Attribute Data\n", name); MailWarning(cfg, 6, "Device: %s, failed to
read SMART Attribute Data", name); } - else { - // look for failed usage attributes, or track usage or prefail attributes - for (i=0; i<NUMBER_ATA_SMART_ATTRIBUTES; i++){ - int att; - changedattribute_t delta; - - // This block looks for usage attributes that have failed. - // Prefail attributes that have failed are returned with a - // positive sign. No failure returns 0. Usage attributes<0. - if (cfg->usagefailed && ((att=ataCheckAttribute(&curval, thresh, i))<0)){ - - // are we ignoring failures of this attribute? - att *= -1; - if (!IsAttributeOff(att, &cfg->monitorattflags, 0, MONITOR_FAILUSE, __LINE__)){ - char attname[64], *loc=attname; - - // get attribute name & skip white space - ataPrintSmartAttribName(loc, att, cfg->attributedefs); - while (*loc && *loc==' ') loc++; - - // warning message - PrintOut(LOG_CRIT, "Device: %s, Failed SMART usage Attribute: %s.\n", name, loc); - MailWarning(cfg, 2, "Device: %s, Failed SMART usage Attribute: %s.", name, loc); - } - } - - // This block tracks usage or prefailure attributes to see if - // they are changing. It also looks for changes in RAW values - // if this has been requested by user. - if ((cfg->usage || cfg->prefail) && ATACompareValues(&delta, &curval, cfg->smartval, thresh, i, name)){ - unsigned char id=delta.id; - - // if the only change is the raw value, and we're not - // tracking raw value, then continue loop over attributes - if (!delta.sameraw && delta.newval==delta.oldval && !IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_RAW, __LINE__)) - continue; - - // are we tracking this attribute? - if (!IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_IGNORE, __LINE__)){ - char newrawstring[64], oldrawstring[64], attname[64], *loc=attname; - - // get attribute name, skip spaces - ataPrintSmartAttribName(loc, id, cfg->attributedefs); - while (*loc && *loc==' ') loc++; - - // has the user asked for us to print raw values? 
- if (IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_RAWPRINT, __LINE__)) { - // get raw values (as a string) and add to printout - char rawstring[64]; - ataPrintSmartAttribRawValue(rawstring, curval.vendor_attributes+i, cfg->attributedefs); - sprintf(newrawstring, " [Raw %s]", rawstring); - ataPrintSmartAttribRawValue(rawstring, cfg->smartval->vendor_attributes+i, cfg->attributedefs); - sprintf(oldrawstring, " [Raw %s]", rawstring); - } - else - newrawstring[0]=oldrawstring[0]='\0'; - - // prefailure attribute - if (cfg->prefail && delta.prefail) - PrintOut(LOG_INFO, "Device: %s, SMART Prefailure Attribute: %s changed from %d%s to %d%s\n", - name, loc, delta.oldval, oldrawstring, delta.newval, newrawstring); - - // usage attribute - if (cfg->usage && !delta.prefail) - PrintOut(LOG_INFO, "Device: %s, SMART Usage Attribute: %s changed from %d%s to %d%s\n", - name, loc, delta.oldval, oldrawstring, delta.newval, newrawstring); - } - } // endof block tracking usage or prefailure - } // end of loop over attributes - - // Save the new values into *drive for the next time around - *(cfg->smartval)=curval; - } + else { + // look for current or offline pending sectors + if (cfg->pending != DONT_MONITOR_UNC) { + int64_t rawval; + unsigned char currentpending, offlinepending; + + TranslatePending(cfg->pending, &currentpending, &offlinepending); + + if (currentpending && (rawval=ATAReturnAttributeRawValue(currentpending, &curval))>0) { + // Unreadable pending sectors!! + PrintOut(LOG_CRIT, "Device: %s, %"PRIu64" Currently unreadable (pending) sectors\n", name, rawval); + MailWarning(cfg, 10, "Device: %s, %"PRIu64" Currently unreadable (pending) sectors", name, rawval); + } + + if (offlinepending && (rawval=ATAReturnAttributeRawValue(offlinepending, &curval))>0) { + // Unreadable offline sectors!!
+ PrintOut(LOG_CRIT, "Device: %s, %"PRIu64" Offline uncorrectable sectors\n", name, rawval); + MailWarning(cfg, 11, "Device: %s, %"PRIu64" Offline uncorrectable sectors", name, rawval); + } + } + + if (cfg->usagefailed || cfg->prefail || cfg->usage) { + + // look for failed usage attributes, or track usage or prefail attributes + for (i=0; i<NUMBER_ATA_SMART_ATTRIBUTES; i++){ + int att; + changedattribute_t delta; + + // This block looks for usage attributes that have failed. + // Prefail attributes that have failed are returned with a + // positive sign. No failure returns 0. Usage attributes<0. + if (cfg->usagefailed && ((att=ataCheckAttribute(&curval, thresh, i))<0)){ + + // are we ignoring failures of this attribute? + att *= -1; + if (!IsAttributeOff(att, &cfg->monitorattflags, 0, MONITOR_FAILUSE, __LINE__)){ + char attname[64], *loc=attname; + + // get attribute name & skip white space + ataPrintSmartAttribName(loc, att, cfg->attributedefs); + while (*loc && *loc==' ') loc++; + + // warning message + PrintOut(LOG_CRIT, "Device: %s, Failed SMART usage Attribute: %s.\n", name, loc); + MailWarning(cfg, 2, "Device: %s, Failed SMART usage Attribute: %s.", name, loc); + } + } + + // This block tracks usage or prefailure attributes to see if + // they are changing. It also looks for changes in RAW values + // if this has been requested by user. + if ((cfg->usage || cfg->prefail) && ATACompareValues(&delta, &curval, cfg->smartval, thresh, i, name)){ + unsigned char id=delta.id; + + // if the only change is the raw value, and we're not + // tracking raw value, then continue loop over attributes + if (!delta.sameraw && delta.newval==delta.oldval && !IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_RAW, __LINE__)) + continue; + + // are we tracking this attribute? 
+ if (!IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_IGNORE, __LINE__)){ + char newrawstring[64], oldrawstring[64], attname[64], *loc=attname; + + // get attribute name, skip spaces + ataPrintSmartAttribName(loc, id, cfg->attributedefs); + while (*loc && *loc==' ') loc++; + + // has the user asked for us to print raw values? + if (IsAttributeOff(id, &cfg->monitorattflags, 0, MONITOR_RAWPRINT, __LINE__)) { + // get raw values (as a string) and add to printout + char rawstring[64]; + ataPrintSmartAttribRawValue(rawstring, curval.vendor_attributes+i, cfg->attributedefs); + sprintf(newrawstring, " [Raw %s]", rawstring); + ataPrintSmartAttribRawValue(rawstring, cfg->smartval->vendor_attributes+i, cfg->attributedefs); + sprintf(oldrawstring, " [Raw %s]", rawstring); + } + else + newrawstring[0]=oldrawstring[0]='\0'; + + // prefailure attribute + if (cfg->prefail && delta.prefail) + PrintOut(LOG_INFO, "Device: %s, SMART Prefailure Attribute: %s changed from %d%s to %d%s\n", + name, loc, delta.oldval, oldrawstring, delta.newval, newrawstring); + + // usage attribute + if (cfg->usage && !delta.prefail) + PrintOut(LOG_INFO, "Device: %s, SMART Usage Attribute: %s changed from %d%s to %d%s\n", + name, loc, delta.oldval, oldrawstring, delta.newval, newrawstring); + } + } // endof block tracking usage or prefailure + } // end of loop over attributes + + // Save the new values into *drive for the next time around + *(cfg->smartval)=curval; + } + } } // check if number of selftest errors has increased (note: may also DECREASE) @@ -2008,8 +2084,9 @@ int ATACheckDevice(cfgfile *cfg){ // did command fail? if (new<0) + // lack of PrintOut here is INTENTIONAL MailWarning(cfg, 7, "Device: %s, Read SMART Error Log Failed", name); - + // has error count increased? 
if (new>old){ PrintOut(LOG_CRIT, "Device: %s, ATA error count increased from %d to %d\n", @@ -2065,8 +2142,9 @@ int SCSICheckDevice(cfgfile *cfg) // if we can't open device, fail gracefully rather than hard -- // perhaps the next time around we'll be able to open it if ((fd=OpenDevice(name, "SCSI"))<0) { - MailWarning(cfg, 9, "Device: %s, unable to open device", name); - return 1; + // Lack of PrintOut() here is intentional! + MailWarning(cfg, 9, "Device: %s, unable to open device", name); + return 1; } currenttemp = 0; asc = 0; @@ -2378,6 +2456,30 @@ int ParseToken(char *token,cfgfile *cfg){ switch (sym) { int val; + case 'C': + // monitor current pending sector count (default 197) + if ((val=GetInteger(arg=strtok(NULL,delim), name, token, lineno, configfile, 0, 255))<0) + return -1; + if (val==CUR_UNC_DEFAULT) + val=0; + else if (val==0) + val=CUR_UNC_DEFAULT; + // set bottom 8 bits to correct value + cfg->pending &= 0xff00; + cfg->pending |= val; + break; + case 'U': + // monitor offline uncorrectable sectors (default 198) + if ((val=GetInteger(arg=strtok(NULL,delim), name, token, lineno, configfile, 0, 255))<0) + return -1; + if (val==OFF_UNC_DEFAULT) + val=0; + else if (val==0) + val=OFF_UNC_DEFAULT; + // turn off top 8 bits, then set to correct value + cfg->pending &= 0xff; + cfg->pending |= (val<<8); + break; case 'T': // Set tolerance level for SMART command failures if ((arg = strtok(NULL, delim)) == NULL) { diff --git a/sm5/smartd.h b/sm5/smartd.h index 0af4555abc13979adc3b07aa5428eda6f308a80e..217f703d3dcccb0c211c1088eab79af3cb0df74c 100644 --- a/sm5/smartd.h +++ b/sm5/smartd.h @@ -32,7 +32,7 @@ #ifndef SMARTD_H_CVSID -#define SMARTD_H_CVSID "$Id: smartd.h,v 1.69 2004/01/27 06:19:39 shattered Exp $\n" +#define SMARTD_H_CVSID "$Id: smartd.h,v 1.70 2004/04/09 00:28:44 ballen4705 Exp $\n" #endif // Configuration file @@ -64,7 +64,7 @@ // Number of allowed mail message types -#define SMARTD_NMAIL 10 +#define SMARTD_NMAIL 12 typedef struct mailinfo_s { int 
logged;// number of times an email has been sent @@ -76,7 +76,7 @@ typedef struct mailinfo_s { // stores the information about them, and track type/date of email // messages. typedef struct maildata_s { - mailinfo maillog[10]; // log info on when mail sent + mailinfo maillog[SMARTD_NMAIL]; // log info on when mail sent char *emailcmdline; // script to execute char *address; // email address, or null unsigned char emailfreq; // Emails once (1) daily (2) diminishing (3) @@ -188,7 +188,8 @@ typedef struct configfile_s { unsigned char selflogcount; // total number of self-test errors unsigned short selfloghour; // lifetime hours of last self-test error testinfo *testdata; // Pointer to data on scheduled testing - + unsigned short pending; // lower 8 bits: ID of current pending sector count + // upper 8 bits: ID of offline pending sector count // THE NEXT SET OF ENTRIES ALSO TRACK DEVICE STATE AND ARE DYNAMIC maildata *mailwarn; // non-NULL: info about sending mail or executing script @@ -273,3 +274,18 @@ export NJAMD_TRACE_LIBS=1 #define SELFTEST_ERRORCOUNT(x) (x & 0xff) #define SELFTEST_ERRORHOURS(x) ((x >> 8) & 0xffff) + +// cfg->pending is a 16 bit unsigned quantity. If the least +// significant 8 bits are zero, this means monitor Attribute +// CUR_UNC_DEFAULT's raw value. If they are CUR_UNC_DEFAULT, this +// means DON'T MONITOR. If the most significant 8 bits are zero, this +// means monitor Attribute OFF_UNC_DEFAULT's raw value. If they are +// OFF_UNC_DEFAULT, this means DON'T MONITOR. +#define OFF_UNC_DEFAULT 198 +#define CUR_UNC_DEFAULT 197 + +#define CURR_PEND(x) (x & 0xff) +#define OFF_PEND(x) ((x >> 8) & 0xff) + +// if cfg->pending has this value, don't monitor +#define DONT_MONITOR_UNC (256*OFF_UNC_DEFAULT+CUR_UNC_DEFAULT)