Vesta: /vesta/vestasys.org/basics/basics/33/src/TextCommon.C Source File

00001 // Copyright (C) 2001, Compaq Computer Corporation
00002 
00003 // This file is part of Vesta.
00004 
00005 // Vesta is free software; you can redistribute it and/or
00006 // modify it under the terms of the GNU Lesser General Public
00007 // License as published by the Free Software Foundation; either
00008 // version 2.1 of the License, or (at your option) any later version.
00009 
00010 // Vesta is distributed in the hope that it will be useful,
00011 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013 // Lesser General Public License for more details.
00014 
00015 // You should have received a copy of the GNU Lesser General Public
00016 // License along with Vesta; if not, write to the Free Software
00017 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00018 
00019 // Last modified on Sun Jan  8 20:18:58 EST 2006 by ken@xorian.net        
00020 //      modified on Sun Jul 28 11:36:00 EDT 2002 by lken@remote.xorian.net
00021 //      modified on Tue Jun 18 16:03:13 EDT 2002 by kcschalk@shr.intel.com
00022 //      modified on Tue Feb  8 11:30:10 PST 2000 by mann   
00023 //      modified on Wed Jul 16 16:36:33 PDT 1997 by heydon 
00024 
00025 // Code shared by the "Text.C" and "TextGC.C" implementations. This file
00026 // is #included by both those files.
00027 
00028 #include "Basics.H"
00029 #include "Text.H"
00030 
00031 // The empty string to which the Text() constructor initializes new Text's
00032 static const char* const EmptyStr = "";
00033 
00034 static const char *StrCopy(const char *str) throw ()
00035 // Returns a newly allocated copy of the null-terminated string "str".
00036 {
00037     int len = strlen(str) + 1;  // include +1 for null terminator
00038     char *res = NEW_PTRFREE_ARRAY(char, len);
00039     memcpy(res, str, len);
00040     return res;
00041 }
00042 
00043 static const char *StrCopy(const char *s, int len) throw ()
00044 /* Returns a newly-allocated, null-terminated copy of the "len" chars pointed
00045    to by "s"; it is an unchecked run-time error for any of those bytes to
00046    be the null character. */
00047 {
00048     char *res = NEW_PTRFREE_ARRAY(char, len+1);
00049     memcpy(res, s, len);
00050     res[len] = '\0';
00051     return res;
00052 }
00053 
00054 // constructors
00055 Text::Text() throw ()
00056 {
00057     this->s = EmptyStr;
00058 }
00059 
00060 Text::Text(const char *str, void *copy) throw ()
00061 {
00062     this->s = (copy == (void *)NULL) ? StrCopy(str) : str;
00063 }
00064 
00065 Text::Text(const char *bytes, int len) throw ()
00066 {
00067     this->s = StrCopy(bytes, len);
00068 }
00069 
00070 Text::Text(const std::string &str) throw()
00071 {
00072   this->s = StrCopy(str.c_str());
00073 }
00074 
00075 Text Text::Sub(int start, int len) const throw ()
00076 {
00077     int tLen = Length();
00078     Text res;
00079     if (start >= tLen || len == 0) {
00080         res.s = EmptyStr;
00081     } else {
00082         if ((unsigned int)start + (unsigned int)len > tLen) {
00083             // In this case, we can include the null terminator in the copy
00084             len = tLen - start + 1; // include +1 for null terminator
00085             char *temp = NEW_PTRFREE_ARRAY(char, len);
00086             memcpy(temp, this->s + start, len);
00087             res.s = temp;
00088         } else {
00089             // include +1 for null terminator
00090             char *temp = NEW_PTRFREE_ARRAY(char, len+1);
00091             memcpy(temp, this->s + start, len);
00092             temp[len] = '\0';
00093             res.s = temp;
00094         }
00095     }
00096     return res;
00097 }
00098 
00099 int Text::FindChar(char c, int start) const throw ()
00100 {
00101     int tLen = Length();
00102     for (register int i = max(start, 0); i < tLen; i++) {
00103         if (this->s[i] == c) return i;
00104     }
00105     return -1;
00106 }
00107 
00108 int Text::FindCharR(char c, int start) const throw ()
00109 {
00110     int tLen = Length();
00111     for (register int i = min(start, tLen - 1); i >= 0; i--) {
00112         if (this->s[i] == c) return i;
00113     }
00114     return -1;
00115 }
00116 
00117 int Text::FindText(const Text &substr, int start) const throw ()
00118 {
00119     start = max(start, 0);
00120     if (start + substr.Length() > this->Length()) return -1;
00121     char *match = strstr(this->s + start, substr.s);
00122     if (match != (char *)NULL) return (match - this->s);
00123     return -1;
00124 }
00125 
00126 const int
00127   N = sizeof(Word),             // bytes per Word
00128   ByteBits = 8,                 // bits per byte
00129   WordBits = N * 8;             // bits per Word
00130 
00131 Word RotateWord(Word w, int b) throw ()
00132 // Returns the result of rotating "w" up by "b" bits (or down by -b
00133 // bits if b is negative).
00134 {
00135     b = (b % WordBits);
00136 
00137     // We know that on big-endian systems this function will only be
00138     // called with negative values and on little-endian systems it
00139     // will only be called with positive values, so we optimize away
00140     // the unused clause.
00141 #if BYTE_ORDER == BIG_ENDIAN
00142     if (b < 0)
00143       {
00144         b = -b;
00145         Word hiMask = (((Word) -1) << b);
00146         Word loMask = (((Word) -1) ^ hiMask);
00147         w = ((w & loMask) << (WordBits - b)) | ((w & hiMask) >> b);
00148       }
00149 #else
00150     // else
00151     if (b > 0)
00152       {
00153         Word hiMask = (((Word) -1) << (WordBits - b));
00154         Word loMask = (((Word) -1) ^ hiMask);
00155         w = ((w & loMask) << b) | ((w & hiMask) >> (WordBits - b));
00156       }
00157 #endif
00158 
00159     return w;
00160 }
00161 
00162 const int
00163   Up1 = ByteBits,               // bits per byte
00164   LgUp1 = 3;                    // log_2(Up1)
00165 
00166 Word Text::Hash() const throw ()
00167   // This function is based on the implementation in the file
00168   // "UnsafeHash.m3" that is part of the SRC Modula-3 distribution.
00169   // That file has a more complete description of the method.
00170 
00171   // Conceptually, this can be thought of as a faster version of this:
00172 
00173   //    Word res = 0;
00174   //    unsinged int N = sizeof(Word);
00175   //    char *resc = ((char *) res);
00176   //    for(unsigned int i = 0; i < strlen(this->s); i++)
00177   //    {
00178   //      res_c[i % N] = res_c[i % N] ^ this->s[i];
00179   //    }
00180   //    res += strlen(this->s);
00181   //    return res;
00182 
00183   // Although the result value of the code below will not be identical
00184   // (as it's affected by the byte-order of the host processor).
00185 
00186   // (A web search for "UnsafeHash.m3" should turn up the original.  I
00187   // considered simply reproducing the original description/proof
00188   // here, but that would undoubtedly cause license/copyright
00189   // issues. --KCS)
00190 {
00191     const PointerInt modNmask = (Word)N - 1UL;
00192     Word temp = 0UL;
00193     const char *p = this->s;
00194     Word m = (Word)strlen(p);
00195     const char *endp = p + m;
00196 
00197 #if defined(VALGRIND_SUPPORT)
00198     // Lower performance but completely safe, even as far as paranoid
00199     // run-time checkers like valgrind are concerned.
00200     char *temp_c = ((char *) &temp);
00201     for(unsigned int i = 0; i < m; i++)
00202       {
00203         temp_c[i % sizeof(temp)] ^= this->s[i];
00204       }
00205     return m + temp;
00206 #else
00207     // process at most N-1 initial chars if "p" is not word-aligned
00208     PointerInt jpre = (PointerInt)p & modNmask, jpost;
00209     if ((jpre != 0) && (p != endp))
00210       {
00211         // jpost is the number of junk bytes in the first word after
00212         // the non-junk bytes.  (This is only non-zero for very short
00213         // strings.)
00214         jpost = max(0, N - jpre - m);
00215 
00216         // Get the Word-aligned memory that contains the start of the
00217         // string (and jpre preceeding junk bytes).
00218         temp = *((Word *)(p-jpre));
00219 
00220         // Shift temp to remove preceeding junk bytes and following
00221         // junk bytes (to handle the case of very short strings where
00222         // jpost > 0).
00223 #if BYTE_ORDER == BIG_ENDIAN
00224         temp <<= (jpre << LgUp1);
00225         temp >>= ((jpost + jpre) << LgUp1);
00226 #else
00227         temp >>= (jpre << LgUp1);
00228         temp <<= ((jpost + jpre) << LgUp1);
00229 #endif
00230 
00231         // Skip forward to the next word-aligned chunk of the string
00232         // or the end of the string, whichever comes first.
00233         p += (N - jpre - jpost);
00234       }
00235 
00236     // process middle words
00237     while (p + N <= endp) {
00238         temp ^= *((Word *)p);
00239         p += N;
00240     }
00241     
00242     // process at most N-1 trailing chars
00243     if (p != endp)
00244       {
00245         // w is the last Word-aligned chunk of the string, including
00246         // some trailing junk characters.
00247         Word w = *((Word *)p);
00248 
00249         // jpost is the number of trailing junk characters in this
00250         // Word but not in the string.  jpostUp1 is the number of bits
00251         // in jpost bytes.
00252         jpost = N - (endp - p);
00253         Word jpostUp1 = (jpost << LgUp1);
00254 
00255         // Shift w to remove bytes past the end of the string and
00256         // thereby avoid including random garbage in the hash value.
00257         // Rotate temp by the same amount in the same direction as the
00258         // shift to align it with w.
00259 #if BYTE_ORDER == BIG_ENDIAN
00260         w >>= jpostUp1;
00261         temp = RotateWord(temp, -jpostUp1);
00262 #else
00263         w <<= jpostUp1;
00264         temp = RotateWord(temp, jpostUp1);
00265 #endif
00266 
00267         // Combine w (now free of junk bytes) into the working hash
00268         // value.
00269         temp ^= w;
00270     }
00271 
00272     // Finally, we rotate the hash value by the same number of bytes
00273     // as the string length and add the string length to the result
00274     // (to ensure that strings which are an even number of identical
00275     // repitions of Word-sized sequences don't all get the same hash
00276     // value).
00277 #if BYTE_ORDER == BIG_ENDIAN
00278     return m + RotateWord(temp, -(m << LgUp1));
00279 #else
00280     return m + RotateWord(temp, (m << LgUp1));
00281 #endif
00282 
00283 #endif // SAFE_HASH
00284 }
00285 
00286 Text Text::WordWrap(const Text &prefix, unsigned int columns)
00287   const throw ()
00288 {
00289   Text result = prefix;
00290   unsigned int line_len = result.Length();
00291   const char *read_p = s;
00292 
00293   // Until we run out of original...
00294   while(*read_p)
00295     {
00296       // Find the beginning and end of the next word.
00297       const char *next_word_start = read_p, *next_word_end;
00298       while(*next_word_start && isspace(*next_word_start))
00299         next_word_start++;
00300       next_word_end = next_word_start;
00301       while(*next_word_end && !isspace(*next_word_end))
00302         next_word_end++;
00303 
00304       // See if adding this word to the current like would put us over
00305       // the line length limit.
00306       if((line_len + (next_word_end - read_p)) > columns)
00307         {
00308           // Replace the whitespace before this word with a newline,
00309           // and update the line length counter.
00310           unsigned int next_word_len = (next_word_end - next_word_start);
00311           result += (Text("\n") + prefix +
00312                      Text(next_word_start, next_word_len));
00313           line_len = prefix.Length() + next_word_len;
00314         }
00315       // If this would still be within the line length limit, just
00316       // copy them over.
00317       else
00318         {
00319           unsigned int added_len = (next_word_end - read_p);
00320           result += Text(read_p, added_len);
00321           line_len += added_len;
00322         }
00323 
00324       // Regardless of what we did, move past the next word.
00325       read_p = next_word_end;
00326     }
00327 
00328   // Return the result.
00329   return result;
00330 }
00331 
00332 // Shared code for PadLeft and PadRight: computes the number of copies
00333 // needed (including possibly a partial copy) and allocates the
00334 // storage for the new string.
00335 
00336 static void PaddingCopies(unsigned int baseLen,
00337                           unsigned int finalLen,
00338                           unsigned int padLen,
00339                           /*OUT*/ unsigned int &copies,
00340                           /*OUT*/ unsigned int &partial,
00341                           /*OUT*/ char *&dest)
00342 {
00343   assert(padLen > 0);
00344   unsigned int needed = (finalLen > baseLen) ? (finalLen - baseLen) : 0;
00345   copies = needed/padLen;
00346   partial = needed - (copies*padLen);
00347   unsigned int totalLen = baseLen + (copies*padLen) + partial;
00348   assert(totalLen >= finalLen);
00349   dest = NEW_PTRFREE_ARRAY(char, totalLen+1);
00350   *dest = 0;
00351 }
00352 
00353 // Shared code for PadLeft and PadRight: copies the padding string
00354 // multiple times (including possibly a final partial copy).
00355 
00356 static void CopyPadding(unsigned int copies, unsigned int partial,
00357                         char *dest, const Text &padding)
00358 {
00359   while(copies > 0)
00360     {
00361       strcat(dest, padding.cchars());
00362       copies--;
00363     }
00364   if(partial)
00365     {
00366       strncat(dest, padding.cchars(), partial);
00367     }
00368 }
00369 
00370 Text Text::PadLeft(unsigned int toLen, const Text &padding)
00371   const throw()
00372 {
00373 
00374   unsigned int copies, partial;
00375   char *dest;
00376   PaddingCopies(this->Length(), toLen, padding.Length(),
00377                 copies, partial, dest);
00378 
00379   // Padding on the left
00380   CopyPadding(copies, partial, dest, padding);
00381   // Original on the right
00382   strcat(dest, this->s);
00383 
00384   Text res;
00385   res.s = dest;
00386   return res;
00387 }
00388 
00389 Text Text::PadRight(unsigned int toLen, const Text &padding)
00390   const throw()
00391 {
00392   unsigned int copies, partial;
00393   char *dest;
00394   PaddingCopies(this->Length(), toLen, padding.Length(),
00395                 copies, partial, dest);
00396 
00397   // Oritinal on the left
00398   strcat(dest, this->s);
00399   // Padding on the right
00400   CopyPadding(copies, partial, dest, padding);
00401 
00402   Text res;
00403   res.s = dest;
00404   return res;
00405 }