• Skip to content
  • Skip to link menu
KDE 4.2 API Reference
  • KDE API Reference
  • KDE-PIM Libraries
  • Sitemap
  • Contact Us
 

KMIME Library

kmime_header_parsing.cpp

00001 /*  -*- c++ -*-
00002     kmime_header_parsing.cpp
00003 
00004     KMime, the KDE internet mail/usenet news message library.
00005     Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
00006 
00007     This library is free software; you can redistribute it and/or
00008     modify it under the terms of the GNU Library General Public
00009     License as published by the Free Software Foundation; either
00010     version 2 of the License, or (at your option) any later version.
00011 
00012     This library is distributed in the hope that it will be useful,
00013     but WITHOUT ANY WARRANTY; without even the implied warranty of
00014     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015     Library General Public License for more details.
00016 
00017     You should have received a copy of the GNU Library General Public License
00018     along with this library; see the file COPYING.LIB.  If not, write to
00019     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00020     Boston, MA 02110-1301, USA.
00021 */
00022 
00023 #include "kmime_header_parsing.h"
00024 
00025 #include "kmime_codecs.h"
00026 #include "kmime_util.h"
00027 #include "kmime_dateformatter.h"
00028 #include "kmime_warning.h"
00029 
00030 #include <kglobal.h>
00031 #include <kcharsets.h>
00032 
00033 #include <QtCore/QTextCodec>
00034 #include <QtCore/QMap>
00035 #include <QtCore/QStringList>
00036 #include <QtCore/QUrl>
00037 
00038 #include <ctype.h> // for isdigit
00039 #include <cassert>
00040 
00041 using namespace KMime;
00042 using namespace KMime::Types;
00043 
00044 namespace KMime {
00045 
00046 namespace Types {
00047 
00048 static QString addr_spec_as_string( const AddrSpec & as, bool pretty )
00049 {
00050   if ( as.isEmpty() ) {
00051     return QString();
00052   }
00053 
00054   bool needsQuotes = false;
00055   QString result;
00056   result.reserve( as.localPart.length() + as.domain.length() + 1 );
00057   for ( int i = 0 ; i < as.localPart.length() ; ++i ) {
00058     const char ch = as.localPart[i].toLatin1();
00059     if ( ch == '.' || isAText( ch ) ) {
00060       result += ch;
00061     } else {
00062       needsQuotes = true;
00063       if ( ch == '\\' || ch == '"' ) {
00064         result += '\\';
00065       }
00066       result += ch;
00067     }
00068   }
00069   const QString dom = pretty ? QUrl::fromAce( as.domain.toLatin1() ) : as.domain ;
00070   if ( needsQuotes ) {
00071     return '"' + result + "\"@" + dom;
00072   } else {
00073     return result + '@' + dom;
00074   }
00075 }
00076 
00077 QString AddrSpec::asString() const
00078 {
00079     return addr_spec_as_string( *this, false );
00080 }
00081 
00082 QString AddrSpec::asPrettyString() const
00083 {
00084     return addr_spec_as_string( *this, true );
00085 }
00086 
00087 bool AddrSpec::isEmpty() const
00088 {
00089   return localPart.isEmpty() && domain.isEmpty();
00090 }
00091 
00092 QByteArray Mailbox::address() const
00093 {
00094   return mAddrSpec.asString().toLatin1();
00095 }
00096 
00097 AddrSpec Mailbox::addrSpec() const
00098 {
00099   return mAddrSpec;
00100 }
00101 
00102 QString Mailbox::name() const
00103 {
00104   return mDisplayName;
00105 }
00106 
00107 void Mailbox::setAddress( const AddrSpec &addr )
00108 {
00109   mAddrSpec = addr;
00110 }
00111 
00112 void Mailbox::setAddress( const QByteArray &addr )
00113 {
00114   const char *cursor = addr.constData();
00115   if ( !HeaderParsing::parseAngleAddr( cursor,
00116                                        cursor + addr.length(), mAddrSpec ) ) {
00117     if ( !HeaderParsing::parseAddrSpec( cursor, cursor + addr.length(),
00118                                         mAddrSpec ) ) {
00119       kWarning() << "Invalid address";
00120       return;
00121     }
00122   }
00123 }
00124 
00125 void Mailbox::setName( const QString &name )
00126 {
00127   mDisplayName = name;
00128 }
00129 
00130 void Mailbox::setNameFrom7Bit( const QByteArray &name,
00131                                const QByteArray &defaultCharset )
00132 {
00133   QByteArray cs;
00134   mDisplayName = decodeRFC2047String( name, cs, defaultCharset, false );
00135 }
00136 
00137 bool Mailbox::hasAddress() const
00138 {
00139   return !mAddrSpec.isEmpty();
00140 }
00141 
00142 bool Mailbox::hasName() const
00143 {
00144   return !mDisplayName.isEmpty();
00145 }
00146 
00147 QString Mailbox::prettyAddress() const
00148 {
00149   if ( !hasName() ) {
00150     return address();
00151   }
00152   QString s = name();
00153   if ( hasAddress() ) {
00154     s += QLatin1String(" <") + address() + QLatin1Char('>');
00155   }
00156   return s;
00157 }
00158 
00159 void Mailbox::fromUnicodeString( const QString &s )
00160 {
00161   from7BitString( encodeRFC2047String( s, "utf-8", false ) );
00162 }
00163 
00164 void Mailbox::from7BitString( const QByteArray &s )
00165 {
00166   const char *cursor = s.constData();
00167   HeaderParsing::parseMailbox( cursor, cursor + s.length(), *this );
00168 }
00169 
00170 QByteArray KMime::Types::Mailbox::as7BitString( const QByteArray &encCharset ) const
00171 {
00172   if ( !hasName() ) {
00173     return address();
00174   }
00175   QByteArray rv;
00176   if ( isUsAscii( name() ) ) {
00177     QByteArray tmp = name().toLatin1();
00178     addQuotes( tmp, false );
00179     rv += tmp;
00180   } else {
00181     rv += encodeRFC2047String( name(), encCharset, true );
00182   }
00183   if ( hasAddress() ) {
00184     rv += " <" + address() + '>';
00185   }
00186   return rv;
00187 }
00188 
00189 } // namespace Types
00190 
00191 namespace HeaderParsing {
00192 
00193 // parse the encoded-word (scursor points to after the initial '=')
00194 bool parseEncodedWord( const char* &scursor, const char * const send,
00195                        QString &result, QByteArray &language,
00196                        QByteArray &usedCS, const QByteArray &defaultCS,
00197                        bool forceCS )
00198 {
00199   // make sure the caller already did a bit of the work.
00200   assert( *(scursor-1) == '=' );
00201 
00202   //
00203   // STEP 1:
00204   // scan for the charset/language portion of the encoded-word
00205   //
00206 
00207   char ch = *scursor++;
00208 
00209   if ( ch != '?' ) {
00210     // kDebug(5320) << "first";
00211     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00212     return false;
00213   }
00214 
00215   // remember start of charset (ie. just after the initial "=?") and
00216   // language (just after the first '*') fields:
00217   const char * charsetStart = scursor;
00218   const char * languageStart = 0;
00219 
00220   // find delimiting '?' (and the '*' separating charset and language
00221   // tags, if any):
00222   for ( ; scursor != send ; scursor++ ) {
00223     if ( *scursor == '?') {
00224       break;
00225     } else if ( *scursor == '*' && languageStart == 0 ) {
00226       languageStart = scursor + 1;
00227     }
00228   }
00229 
00230   // not found? can't be an encoded-word!
00231   if ( scursor == send || *scursor != '?' ) {
00232     // kDebug(5320) << "second";
00233     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00234     return false;
00235   }
00236 
00237   // extract the language information, if any (if languageStart is 0,
00238   // language will be null, too):
00239   QByteArray maybeLanguage( languageStart, scursor - languageStart );
00240   // extract charset information (keep in mind: the size given to the
00241   // ctor is one off due to the \0 terminator):
00242   QByteArray maybeCharset( charsetStart,
00243                            ( languageStart ? languageStart - 1 : scursor ) - charsetStart );
00244 
00245   //
00246   // STEP 2:
00247   // scan for the encoding portion of the encoded-word
00248   //
00249 
00250   // remember start of encoding (just _after_ the second '?'):
00251   scursor++;
00252   const char * encodingStart = scursor;
00253 
00254   // find next '?' (ending the encoding tag):
00255   for ( ; scursor != send ; scursor++ ) {
00256     if ( *scursor == '?' ) {
00257       break;
00258     }
00259   }
00260 
00261   // not found? Can't be an encoded-word!
00262   if ( scursor == send || *scursor != '?' ) {
00263     // kDebug(5320) << "third";
00264     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00265     return false;
00266   }
00267 
00268   // extract the encoding information:
00269   QByteArray maybeEncoding( encodingStart, scursor - encodingStart );
00270 
00271   // kDebug(5320) << "parseEncodedWord: found charset == \"" << maybeCharset
00272   //         << "\"; language == \"" << maybeLanguage
00273   //         << "\"; encoding == \"" << maybeEncoding << "\"";
00274 
00275   //
00276   // STEP 3:
00277   // scan for encoded-text portion of encoded-word
00278   //
00279 
00280   // remember start of encoded-text (just after the third '?'):
00281   scursor++;
00282   const char * encodedTextStart = scursor;
00283 
00284   // find next '?' (ending the encoded-text):
00285   for ( ; scursor != send ; scursor++ ) {
00286     if ( *scursor == '?' ) {
00287       break;
00288     }
00289   }
00290 
00291   // not found? Can't be an encoded-word!
00292   // ### maybe evaluate it nonetheless if the rest is OK?
00293   if ( scursor == send || *scursor != '?' ) {
00294     // kDebug(5320) << "fourth";
00295     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00296     return false;
00297   }
00298   scursor++;
00299   // check for trailing '=':
00300   if ( scursor == send || *scursor != '=' ) {
00301     // kDebug(5320) << "fifth";
00302     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00303     return false;
00304   }
00305   scursor++;
00306 
00307   // set end sentinel for encoded-text:
00308   const char * const encodedTextEnd = scursor - 2;
00309 
00310   //
00311   // STEP 4:
00312   // setup decoders for the transfer encoding and the charset
00313   //
00314 
00315   // try if there's a codec for the encoding found:
00316   Codec * codec = Codec::codecForName( maybeEncoding );
00317   if ( !codec ) {
00318     KMIME_WARN_UNKNOWN( Encoding, maybeEncoding );
00319     return false;
00320   }
00321 
00322   // get an instance of a corresponding decoder:
00323   Decoder * dec = codec->makeDecoder();
00324   assert( dec );
00325 
00326   // try if there's a (text)codec for the charset found:
00327   bool matchOK = false;
00328   QTextCodec *textCodec = 0;
00329   if ( forceCS || maybeCharset.isEmpty() ) {
00330     textCodec = KGlobal::charsets()->codecForName( defaultCS, matchOK );
00331     usedCS = cachedCharset( defaultCS );
00332   } else {
00333     textCodec = KGlobal::charsets()->codecForName( maybeCharset, matchOK );
00334     if ( !matchOK ) {  //no suitable codec found => use default charset
00335       textCodec = KGlobal::charsets()->codecForName( defaultCS, matchOK );
00336       usedCS = cachedCharset( defaultCS );
00337     } else {
00338       usedCS = cachedCharset( maybeCharset );
00339     }
00340   }
00341 
00342   if ( !matchOK || !textCodec ) {
00343     KMIME_WARN_UNKNOWN( Charset, maybeCharset );
00344     delete dec;
00345     return false;
00346   };
00347 
00348   // kDebug(5320) << "mimeName(): \"" << textCodec->name() << "\"";
00349 
00350   // allocate a temporary buffer to store the 8bit text:
00351   int encodedTextLength = encodedTextEnd - encodedTextStart;
00352   QByteArray buffer;
00353   buffer.resize( codec->maxDecodedSizeFor( encodedTextLength ) );
00354   QByteArray::Iterator bit = buffer.begin();
00355   QByteArray::ConstIterator bend = buffer.end();
00356 
00357   //
00358   // STEP 5:
00359   // do the actual decoding
00360   //
00361 
00362   if ( !dec->decode( encodedTextStart, encodedTextEnd, bit, bend ) ) {
00363     KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
00364                << encodedTextLength << ")\nresult may be truncated";
00365   }
00366 
00367   result = textCodec->toUnicode( buffer.begin(), bit - buffer.begin() );
00368 
00369   // kDebug(5320) << "result now: \"" << result << "\"";
00370   // cleanup:
00371   delete dec;
00372   language = maybeLanguage;
00373 
00374   return true;
00375 }
00376 
// Advances scursor past any run of space, tab, CR and LF characters,
// stopping at the first other character or at send.
static inline void eatWhiteSpace( const char* &scursor, const char * const send )
{
  for ( ; scursor != send ; ++scursor ) {
    switch ( *scursor ) {
    case ' ':
    case '\n':
    case '\t':
    case '\r':
      continue; // still whitespace: keep skipping
    default:
      return;
    }
  }
}
00384 
00385 bool parseAtom( const char * &scursor, const char * const send,
00386                 QString &result, bool allow8Bit )
00387 {
00388   QPair<const char*,int> maybeResult;
00389 
00390   if ( parseAtom( scursor, send, maybeResult, allow8Bit ) ) {
00391     result += QString::fromLatin1( maybeResult.first, maybeResult.second );
00392     return true;
00393   }
00394 
00395   return false;
00396 }
00397 
00398 bool parseAtom( const char * &scursor, const char * const send,
00399                 QPair<const char*,int> &result, bool allow8Bit )
00400 {
00401   bool success = false;
00402   const char *start = scursor;
00403 
00404   while ( scursor != send ) {
00405     signed char ch = *scursor++;
00406     if ( ch > 0 && isAText( ch ) ) {
00407       // AText: OK
00408       success = true;
00409     } else if ( allow8Bit && ch < 0 ) {
00410       // 8bit char: not OK, but be tolerant.
00411       KMIME_WARN_8BIT( ch );
00412       success = true;
00413     } else {
00414       // CTL or special - marking the end of the atom:
00415       // re-set sursor to point to the offending
00416       // char and return:
00417       scursor--;
00418       break;
00419     }
00420   }
00421   result.first = start;
00422   result.second = scursor - start;
00423   return success;
00424 }
00425 
00426 bool parseToken( const char * &scursor, const char * const send,
00427                  QString &result, bool allow8Bit )
00428 {
00429   QPair<const char*,int> maybeResult;
00430 
00431   if ( parseToken( scursor, send, maybeResult, allow8Bit ) ) {
00432     result += QString::fromLatin1( maybeResult.first, maybeResult.second );
00433     return true;
00434   }
00435 
00436   return false;
00437 }
00438 
00439 bool parseToken( const char * &scursor, const char * const send,
00440                  QPair<const char*,int> &result, bool allow8Bit )
00441 {
00442   bool success = false;
00443   const char * start = scursor;
00444 
00445   while ( scursor != send ) {
00446     signed char ch = *scursor++;
00447     if ( ch > 0 && isTText( ch ) ) {
00448       // TText: OK
00449       success = true;
00450     } else if ( allow8Bit && ch < 0 ) {
00451       // 8bit char: not OK, but be tolerant.
00452       KMIME_WARN_8BIT( ch );
00453       success = true;
00454     } else {
00455       // CTL or tspecial - marking the end of the atom:
00456       // re-set sursor to point to the offending
00457       // char and return:
00458       scursor--;
00459       break;
00460     }
00461   }
00462   result.first = start;
00463   result.second = scursor - start;
00464   return success;
00465 }
00466 
// Helper for parseGenericQuotedString(): reads the next input character
// into the local variable "ch" and advances "scursor".  If the cursor is
// already at "send", a premature-end warning is emitted and the *enclosing
// function* returns false.  Expects "scursor", "send" and "ch" to be in
// scope at the point of use.
00467 #define READ_ch_OR_FAIL if ( scursor == send ) {        \
00468     KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \
00469     return false;                                       \
00470   } else {                                              \
00471     ch = *scursor++;                                    \
00472   }
00473 
00474 // known issues:
00475 //
00476 // - doesn't handle quoted CRLF
00477 
// Parses the body of a delimited construct (quoted-string, domain-literal
// or comment), appending the unescaped, unfolded text to @p result.
//
// On entry scursor must point just after an openChar (or closeChar).
// Characters are consumed until an unescaped openChar or closeChar is
// read; the function then returns true with scursor just past that
// delimiter, which is not added to @p result - the caller inspects
// *(scursor-1) to see which one it was.  If send is reached first
// (including in the middle of a quoted-pair or a line-break lookahead)
// the function returns false; @p result keeps whatever was appended so far.
//
// Transformations applied:
//  - "\" x        -> x (the backslash is dropped)
//  - CRLF WSP     -> WSP (unfolding)
//  - CRLF other   -> "\r\n" is kept, "other" is processed normally
//  - lone CR      -> kept
//  - LF WSP       -> WSP, but only if isCRLF is false
//  - other LF     -> kept
00478 bool parseGenericQuotedString( const char* &scursor, const char * const send,
00479                                QString &result, bool isCRLF,
00480                                const char openChar, const char closeChar )
00481 {
00482   char ch;
00483   // We are in a quoted-string or domain-literal or comment and the
00484   // cursor points to the first char after the openChar.
00485   // We will apply unfolding and quoted-pair removal.
00486   // We return when we either encounter the end or unescaped openChar
00487   // or closeChar.
00488 
00489   assert( *(scursor-1) == openChar || *(scursor-1) == closeChar );
00490 
00491   while ( scursor != send ) {
00492     ch = *scursor++;
00493 
00494     if ( ch == closeChar || ch == openChar ) {
00495       // end of quoted-string or another opening char:
00496       // let caller decide what to do.
00497       return true;
00498     }
00499 
00500     switch( ch ) {
00501     case '\\':      // quoted-pair
00502       // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
00503       READ_ch_OR_FAIL;
00504       KMIME_WARN_IF_8BIT( ch );
00505       result += QChar( ch );
00506       break;
00507     case '\r':
00508       // ###
00509       // The case of lonely '\r' is easy to solve, as they're
00510       // not part of Unix Line-ending conventions.
00511       // But I see a problem if we are given Unix-native
00512       // line-ending-mails, where we cannot determine anymore
00513       // whether a given '\n' was part of a CRLF or was occurring
00514       // on its own.
00515       READ_ch_OR_FAIL;
00516       if ( ch != '\n' ) {
00517         // CR on its own...
00518         KMIME_WARN_LONE( CR );
00519         result += QChar('\r');
00520         scursor--; // points to after the '\r' again
00521       } else {
00522         // CRLF encountered.
00523         // lookahead: check for folding
00524         READ_ch_OR_FAIL;
00525         if ( ch == ' ' || ch == '\t' ) {
00526           // correct folding;
00527           // position cursor behind the CRLF WSP (unfolding)
00528           // and add the WSP to the result
00529           result += QChar( ch );
00530         } else {
00531           // this is the "shouldn't happen"-case. There is a CRLF
00532           // inside a quoted-string without it being part of FWS.
00533           // We take it verbatim.
00534           KMIME_WARN_NON_FOLDING( CRLF );
00535           result += "\r\n";
00536           // the cursor is decremented again, so we need not
00537           // duplicate the whole switch here. "ch" could've been
00538           // anything (incl. openChar or closeChar).
00539           scursor--;
00540         }
00541       }
00542       break;
00543     case '\n':
00544       // Note: CRLF has been handled above already!
00545       // ### LF needs special treatment, depending on whether isCRLF
00546       // is true (we can be sure a lonely '\n' was meant this way) or
00547       // false ('\n' alone could have meant LF or CRLF in the original
00548       // message. This parser assumes CRLF iff the LF is followed by
00549       // either WSP (folding) or NULL (premature end of quoted-string;
00550       // Should be fixed, since NULL is allowed as per rfc822).
00551       READ_ch_OR_FAIL;
00552       if ( !isCRLF && ( ch == ' ' || ch == '\t' ) ) {
00553         // folding
00554         // correct folding
00555         result += QChar( ch );
00556       } else {
00557         // non-folding
00558         KMIME_WARN_LONE( LF );
00559         result += QChar('\n');
00560         // pos is decremented, so we need not duplicate the whole
00561         // switch here. ch could've been anything (incl. <">, "\").
00562         scursor--;
00563       }
00564       break;
00565     default:
00566       KMIME_WARN_IF_8BIT( ch );
00567       result += QChar( ch );
00568     }
00569   }
00570 
  // ran into the end of input without seeing a delimiter
00571   return false;
00572 }
00573 
00574 // known issues:
00575 //
00576 // - doesn't handle encoded-word inside comments.
00577 
00578 bool parseComment( const char* &scursor, const char * const send,
00579                    QString &result, bool isCRLF, bool reallySave )
00580 {
00581   int commentNestingDepth = 1;
00582   const char *afterLastClosingParenPos = 0;
00583   QString maybeCmnt;
00584   const char *oldscursor = scursor;
00585 
00586   assert( *(scursor-1) == '(' );
00587 
00588   while ( commentNestingDepth ) {
00589     QString cmntPart;
00590     if ( parseGenericQuotedString( scursor, send, cmntPart, isCRLF, '(', ')' ) ) {
00591       assert( *(scursor-1) == ')' || *(scursor-1) == '(' );
00592       // see the kdoc for above function for the possible conditions
00593       // we have to check:
00594       switch ( *(scursor-1) ) {
00595       case ')':
00596         if ( reallySave ) {
00597           // add the chunk that's now surely inside the comment.
00598           result += maybeCmnt;
00599           result += cmntPart;
00600           if ( commentNestingDepth > 1 ) {
00601             // don't add the outermost ')'...
00602             result += QChar(')');
00603           }
00604           maybeCmnt.clear();
00605         }
00606         afterLastClosingParenPos = scursor;
00607         --commentNestingDepth;
00608         break;
00609       case '(':
00610         if ( reallySave ) {
00611           // don't add to "result" yet, because we might find that we
00612           // are already outside the (broken) comment...
00613           maybeCmnt += cmntPart;
00614           maybeCmnt += QChar('(');
00615         }
00616         ++commentNestingDepth;
00617         break;
00618       default: assert( 0 );
00619       } // switch
00620     } else {
00621       // !parseGenericQuotedString, ie. premature end
00622       if ( afterLastClosingParenPos ) {
00623         scursor = afterLastClosingParenPos;
00624       } else {
00625         scursor = oldscursor;
00626       }
00627       return false;
00628     }
00629   } // while
00630 
00631   return true;
00632 }
00633 
00634 // known issues: none.
00635 
// Parses a phrase: a sequence of atoms, quoted-strings and encoded-words,
// with comments skipped and a '.' tolerated after the first word.
//
// The words are appended to @p result, separated by a single space; no
// space is inserted between two directly adjacent encoded-words.
//
// Returns true if at least one word was found.  When an unparsable
// construct follows at least one word, scursor is reset to just after the
// last successfully parsed item and true is returned - except for an
// unterminated quoted-string, whose partial text is appended and the
// cursor left where the quoted-string parser stopped.  If nothing was
// found, false is returned and the position of scursor depends on the
// failing branch.
00636 bool parsePhrase( const char* &scursor, const char * const send,
00637                   QString &result, bool isCRLF )
00638 {
00639   enum {
00640     None, Phrase, Atom, EncodedWord, QuotedString
00641   } found = None;
00642 
00643   QString tmp;
00644   QByteArray lang, charset;
  // position just after the last item that parsed cleanly
00645   const char *successfullyParsed = 0;
00646   // only used by the encoded-word branch
00647   const char *oldscursor;
00648   // used to suppress whitespace between adjacent encoded-words
00649   // (rfc2047, 6.2):
00650   bool lastWasEncodedWord = false;
00651 
00652   while ( scursor != send ) {
00653     char ch = *scursor++;
00654     switch ( ch ) {
00655     case '.': // broken, but allow for interop's sake
00656       if ( found == None ) {
00657         --scursor;
00658         return false;
00659       } else {
00660         if ( scursor != send && ( *scursor == ' ' || *scursor == '\t' ) ) {
00661           result += ". ";
00662         } else {
00663           result += '.';
00664         }
00665         successfullyParsed = scursor;
00666       }
00667       break;
00668     case '"': // quoted-string
00669       tmp.clear();
00670       if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
00671         successfullyParsed = scursor;
00672         assert( *(scursor-1) == '"' );
00673         switch ( found ) {
00674         case None:
00675           found = QuotedString;
00676           break;
00677         case Phrase:
00678         case Atom:
00679         case EncodedWord:
00680         case QuotedString:
00681           found = Phrase;
00682           result += QChar(' '); // rfc822, 3.4.4
00683           break;
00684         default:
00685           assert( 0 );
00686         }
00687         lastWasEncodedWord = false;
00688         result += tmp;
00689       } else {
00690         // premature end of quoted string.
00691         // What to do? Return leading '"' as special? Return as quoted-string?
00692         // We do the latter if we already found something, else signal failure.
00693         if ( found == None ) {
00694           return false;
00695         } else {
00696           result += QChar(' '); // rfc822, 3.4.4
00697           result += tmp;
00698           return true;
00699         }
00700       }
00701       break;
00702     case '(': // comment
00703       // parse it, but ignore content:
00704       tmp.clear();
00705       if ( parseComment( scursor, send, tmp, isCRLF,
00706                          false /*don't bother with the content*/ ) ) {
00707         successfullyParsed = scursor;
00708         lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
00709       } else {
00710         if ( found == None ) {
00711           return false;
00712         } else {
00713           scursor = successfullyParsed;
00714           return true;
00715         }
00716       }
00717       break;
00718     case '=': // encoded-word
00719       tmp.clear();
      // remember the position so we can retry as an atom on failure
00720       oldscursor = scursor;
00721       lang.clear();
00722       charset.clear();
00723       if ( parseEncodedWord( scursor, send, tmp, lang, charset ) ) {
00724         successfullyParsed = scursor;
00725         switch ( found ) {
00726         case None:
00727           found = EncodedWord;
00728           break;
00729         case Phrase:
00730         case EncodedWord:
00731         case Atom:
00732         case QuotedString:
00733           if ( !lastWasEncodedWord ) {
00734             result += QChar(' '); // rfc822, 3.4.4
00735           }
00736           found = Phrase;
00737           break;
00738         default: assert( 0 );
00739         }
00740         lastWasEncodedWord = true;
00741         result += tmp;
00742         break;
00743       } else {
00744         // parse as atom:
00745         scursor = oldscursor;
00746       }
00747       // fall through...
00748 
00749     default: //atom
00750       tmp.clear();
      // step back onto the character that was already read into ch
00751       scursor--;
00752       if ( parseAtom( scursor, send, tmp, true /* allow 8bit */ ) ) {
00753         successfullyParsed = scursor;
00754         switch ( found ) {
00755         case None:
00756           found = Atom;
00757           break;
00758         case Phrase:
00759         case Atom:
00760         case EncodedWord:
00761         case QuotedString:
00762           found = Phrase;
00763           result += QChar(' '); // rfc822, 3.4.4
00764           break;
00765         default:
00766           assert( 0 );
00767         }
00768         lastWasEncodedWord = false;
00769         result += tmp;
00770       } else {
00771         if ( found == None ) {
00772           return false;
00773         } else {
00774           scursor = successfullyParsed;
00775           return true;
00776         }
00777       }
00778     }
    // skip the whitespace separating this item from the next one
00779     eatWhiteSpace( scursor, send );
00780   }
00781 
00782   return found != None;
00783 }
00784 
00785 bool parseDotAtom( const char* &scursor, const char * const send,
00786                    QString &result, bool isCRLF )
00787 {
00788   eatCFWS( scursor, send, isCRLF );
00789 
00790   // always points to just after the last atom parsed:
00791   const char *successfullyParsed;
00792 
00793   QString tmp;
00794   if ( !parseAtom( scursor, send, tmp, false /* no 8bit */ ) ) {
00795     return false;
00796   }
00797   result += tmp;
00798   successfullyParsed = scursor;
00799 
00800   while ( scursor != send ) {
00801 
00802     // end of header or no '.' -> return
00803     if ( scursor == send || *scursor != '.' ) {
00804       return true;
00805     }
00806     scursor++; // eat '.'
00807 
00808     if ( scursor == send || !isAText( *scursor ) ) {
00809       // end of header or no AText, but this time following a '.'!:
00810       // reset cursor to just after last successfully parsed char and
00811       // return:
00812       scursor = successfullyParsed;
00813       return true;
00814     }
00815 
00816     // try to parse the next atom:
00817     QString maybeAtom;
00818     if ( !parseAtom( scursor, send, maybeAtom, false /*no 8bit*/ ) ) {
00819       scursor = successfullyParsed;
00820       return true;
00821     }
00822 
00823     result += QChar('.');
00824     result += maybeAtom;
00825     successfullyParsed = scursor;
00826   }
00827 
00828   scursor = successfullyParsed;
00829   return true;
00830 }
00831 
00832 void eatCFWS( const char* &scursor, const char * const send, bool isCRLF )
00833 {
00834   QString dummy;
00835 
00836   while ( scursor != send ) {
00837     const char *oldscursor = scursor;
00838 
00839     char ch = *scursor++;
00840 
00841     switch( ch ) {
00842     case ' ':
00843     case '\t': // whitespace
00844     case '\r':
00845     case '\n': // folding
00846       continue;
00847 
00848     case '(': // comment
00849       if ( parseComment( scursor, send, dummy, isCRLF, false /*don't save*/ ) ) {
00850         continue;
00851       }
00852       scursor = oldscursor;
00853       return;
00854 
00855     default:
00856       scursor = oldscursor;
00857       return;
00858     }
00859   }
00860 }
00861 
00862 bool parseDomain( const char* &scursor, const char * const send,
00863                   QString &result, bool isCRLF )
00864 {
00865   eatCFWS( scursor, send, isCRLF );
00866   if ( scursor == send ) {
00867     return false;
00868   }
00869 
00870   // domain := dot-atom / domain-literal / atom *("." atom)
00871   //
00872   // equivalent to:
00873   // domain = dot-atom / domain-literal,
00874   // since parseDotAtom does allow CFWS between atoms and dots
00875 
00876   if ( *scursor == '[' ) {
00877     // domain-literal:
00878     QString maybeDomainLiteral;
00879     // eat '[':
00880     scursor++;
00881     while ( parseGenericQuotedString( scursor, send, maybeDomainLiteral,
00882                                       isCRLF, '[', ']' ) ) {
00883       if ( scursor == send ) {
00884         // end of header: check for closing ']':
00885         if ( *(scursor-1) == ']' ) {
00886           // OK, last char was ']':
00887           result = maybeDomainLiteral;
00888           return true;
00889         } else {
00890           // not OK, domain-literal wasn't closed:
00891           return false;
00892         }
00893       }
00894       // we hit openChar in parseGenericQuotedString.
00895       // include it in maybeDomainLiteral and keep on parsing:
00896       if ( *(scursor-1) == '[' ) {
00897         maybeDomainLiteral += QChar('[');
00898         continue;
00899       }
00900       // OK, real end of domain-literal:
00901       result = maybeDomainLiteral;
00902       return true;
00903     }
00904   } else {
00905     // dot-atom:
00906     QString maybeDotAtom;
00907     if ( parseDotAtom( scursor, send, maybeDotAtom, isCRLF ) ) {
00908       result = maybeDotAtom;
00909       return true;
00910     }
00911   }
00912   return false;
00913 }
00914 
00915 bool parseObsRoute( const char* &scursor, const char* const send,
00916                     QStringList &result, bool isCRLF, bool save )
00917 {
00918   while ( scursor != send ) {
00919     eatCFWS( scursor, send, isCRLF );
00920     if ( scursor == send ) {
00921       return false;
00922     }
00923 
00924     // empty entry:
00925     if ( *scursor == ',' ) {
00926       scursor++;
00927       if ( save ) {
00928         result.append( QString() );
00929       }
00930       continue;
00931     }
00932 
00933     // empty entry ending the list:
00934     if ( *scursor == ':' ) {
00935       scursor++;
00936       if ( save ) {
00937         result.append( QString() );
00938       }
00939       return true;
00940     }
00941 
00942     // each non-empty entry must begin with '@':
00943     if ( *scursor != '@' ) {
00944       return false;
00945     } else {
00946       scursor++;
00947     }
00948 
00949     QString maybeDomain;
00950     if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
00951       return false;
00952     }
00953     if ( save ) {
00954       result.append( maybeDomain );
00955     }
00956 
00957     // eat the following (optional) comma:
00958     eatCFWS( scursor, send, isCRLF );
00959     if ( scursor == send ) {
00960       return false;
00961     }
00962     if ( *scursor == ':' ) {
00963       scursor++;
00964       return true;
00965     }
00966     if ( *scursor == ',' ) {
00967       scursor++;
00968     }
00969   }
00970 
00971   return false;
00972 }
00973 
00974 bool parseAddrSpec( const char* &scursor, const char * const send,
00975                     AddrSpec &result, bool isCRLF )
00976 {
00977   //
00978   // STEP 1:
00979   // local-part := dot-atom / quoted-string / word *("." word)
00980   //
00981   // this is equivalent to:
00982   // local-part := word *("." word)
00983 
00984   QString maybeLocalPart;
00985   QString tmp;
00986 
00987   while ( scursor != send ) {
00988     // first, eat any whitespace
00989     eatCFWS( scursor, send, isCRLF );
00990 
00991     char ch = *scursor++;
00992     switch ( ch ) {
00993     case '.': // dot
00994       maybeLocalPart += QChar('.');
00995       break;
00996 
00997     case '@':
00998       goto SAW_AT_SIGN;
00999       break;
01000 
01001     case '"': // quoted-string
01002       tmp.clear();
01003       if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
01004         maybeLocalPart += tmp;
01005       } else {
01006         return false;
01007       }
01008       break;
01009 
01010     default: // atom
01011       scursor--; // re-set scursor to point to ch again
01012       tmp.clear();
01013       if ( parseAtom( scursor, send, tmp, false /* no 8bit */ ) ) {
01014         maybeLocalPart += tmp;
01015       } else {
01016         return false; // parseAtom can only fail if the first char is non-atext.
01017       }
01018       break;
01019     }
01020   }
01021 
01022   return false;
01023 
01024   //
01025   // STEP 2:
01026   // domain
01027   //
01028 
01029 SAW_AT_SIGN:
01030 
01031   assert( *(scursor-1) == '@' );
01032 
01033   QString maybeDomain;
01034   if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
01035     return false;
01036   }
01037 
01038   result.localPart = maybeLocalPart;
01039   result.domain = maybeDomain;
01040 
01041   return true;
01042 }
01043 
01044 bool parseAngleAddr( const char* &scursor, const char * const send,
01045                      AddrSpec &result, bool isCRLF )
01046 {
01047   // first, we need an opening angle bracket:
01048   eatCFWS( scursor, send, isCRLF );
01049   if ( scursor == send || *scursor != '<' ) {
01050     return false;
01051   }
01052   scursor++; // eat '<'
01053 
01054   eatCFWS( scursor, send, isCRLF );
01055   if ( scursor == send ) {
01056     return false;
01057   }
01058 
01059   if ( *scursor == '@' || *scursor == ',' ) {
01060     // obs-route: parse, but ignore:
01061     KMIME_WARN << "obsolete source route found! ignoring.";
01062     QStringList dummy;
01063     if ( !parseObsRoute( scursor, send, dummy,
01064                          isCRLF, false /* don't save */ ) ) {
01065       return false;
01066     }
01067     // angle-addr isn't complete until after the '>':
01068     if ( scursor == send ) {
01069       return false;
01070     }
01071   }
01072 
01073   // parse addr-spec:
01074   AddrSpec maybeAddrSpec;
01075   if ( !parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
01076     return false;
01077   }
01078 
01079   eatCFWS( scursor, send, isCRLF );
01080   if ( scursor == send || *scursor != '>' ) {
01081     return false;
01082   }
01083   scursor++;
01084 
01085   result = maybeAddrSpec;
01086   return true;
01087 
01088 }
01089 
01090 bool parseMailbox( const char* &scursor, const char * const send,
01091                    Mailbox &result, bool isCRLF )
01092 {
01093   eatCFWS( scursor, send, isCRLF );
01094   if ( scursor == send ) {
01095     return false;
01096   }
01097 
01098   AddrSpec maybeAddrSpec;
01099   QString maybeDisplayName;
01100 
01101   // first, try if it's a vanilla addr-spec:
01102   const char * oldscursor = scursor;
01103   if ( parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
01104     result.setAddress( maybeAddrSpec );
01105     // check for the obsolete form of display-name (as comment):
01106     eatWhiteSpace( scursor, send );
01107     if ( scursor != send && *scursor == '(' ) {
01108       scursor++;
01109       if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /*keep*/ ) ) {
01110         return false;
01111       }
01112     }
01113     result.setNameFrom7Bit( maybeDisplayName.toLatin1() );
01114     return true;
01115   }
01116   scursor = oldscursor;
01117 
01118   // second, see if there's a display-name:
01119   if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
01120     // failed: reset cursor, note absent display-name
01121     maybeDisplayName.clear();
01122     scursor = oldscursor;
01123   } else {
01124     // succeeded: eat CFWS
01125     eatCFWS( scursor, send, isCRLF );
01126     if ( scursor == send ) {
01127       return false;
01128     }
01129   }
01130 
01131   // third, parse the angle-addr:
01132   if ( !parseAngleAddr( scursor, send, maybeAddrSpec, isCRLF ) ) {
01133     return false;
01134   }
01135 
01136   if ( maybeDisplayName.isNull() ) {
01137     // check for the obsolete form of display-name (as comment):
01138     eatWhiteSpace( scursor, send );
01139     if ( scursor != send && *scursor == '(' ) {
01140       scursor++;
01141       if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /*keep*/ ) ) {
01142         return false;
01143       }
01144     }
01145   }
01146 
01147   result.setName( maybeDisplayName );
01148   result.setAddress( maybeAddrSpec );
01149   return true;
01150 }
01151 
01152 bool parseGroup( const char* &scursor, const char * const send,
01153                  Address &result, bool isCRLF )
01154 {
01155   // group         := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
01156   //
01157   // equivalent to:
01158   // group   := display-name ":" [ obs-mbox-list ] ";"
01159 
01160   eatCFWS( scursor, send, isCRLF );
01161   if ( scursor == send ) {
01162     return false;
01163   }
01164 
01165   // get display-name:
01166   QString maybeDisplayName;
01167   if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
01168     return false;
01169   }
01170 
01171   // get ":":
01172   eatCFWS( scursor, send, isCRLF );
01173   if ( scursor == send || *scursor != ':' ) {
01174     return false;
01175   }
01176 
01177   result.displayName = maybeDisplayName;
01178 
01179   // get obs-mbox-list (may contain empty entries):
01180   scursor++;
01181   while ( scursor != send ) {
01182     eatCFWS( scursor, send, isCRLF );
01183     if ( scursor == send ) {
01184       return false;
01185     }
01186 
01187     // empty entry:
01188     if ( *scursor == ',' ) {
01189       scursor++;
01190       continue;
01191     }
01192 
01193     // empty entry ending the list:
01194     if ( *scursor == ';' ) {
01195       scursor++;
01196       return true;
01197     }
01198 
01199     Mailbox maybeMailbox;
01200     if ( !parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
01201       return false;
01202     }
01203     result.mailboxList.append( maybeMailbox );
01204 
01205     eatCFWS( scursor, send, isCRLF );
01206     // premature end:
01207     if ( scursor == send ) {
01208       return false;
01209     }
01210     // regular end of the list:
01211     if ( *scursor == ';' ) {
01212       scursor++;
01213       return true;
01214     }
01215     // eat regular list entry separator:
01216     if ( *scursor == ',' ) {
01217       scursor++;
01218     }
01219   }
01220   return false;
01221 }
01222 
01223 bool parseAddress( const char* &scursor, const char * const send,
01224                    Address &result, bool isCRLF )
01225 {
01226   // address       := mailbox / group
01227 
01228   eatCFWS( scursor, send, isCRLF );
01229   if ( scursor == send ) {
01230     return false;
01231   }
01232 
01233   // first try if it's a single mailbox:
01234   Mailbox maybeMailbox;
01235   const char * oldscursor = scursor;
01236   if ( parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
01237     // yes, it is:
01238     result.displayName.clear();
01239     result.mailboxList.append( maybeMailbox );
01240     return true;
01241   }
01242   scursor = oldscursor;
01243 
01244   Address maybeAddress;
01245 
01246   // no, it's not a single mailbox. Try if it's a group:
01247   if ( !parseGroup( scursor, send, maybeAddress, isCRLF ) ) {
01248     return false;
01249   }
01250 
01251   result = maybeAddress;
01252   return true;
01253 }
01254 
01255 bool parseAddressList( const char* &scursor, const char * const send,
01256                        AddressList &result, bool isCRLF )
01257 {
01258   while ( scursor != send ) {
01259     eatCFWS( scursor, send, isCRLF );
01260     // end of header: this is OK.
01261     if ( scursor == send ) {
01262       return true;
01263     }
01264     // empty entry: ignore:
01265     if ( *scursor == ',' ) {
01266       scursor++;
01267       continue;
01268     }
01269     // broken clients might use ';' as list delimiter, accept that as well
01270     if ( *scursor == ';' ) {
01271       scursor++;
01272       continue;
01273     }
01274 
01275     // parse one entry
01276     Address maybeAddress;
01277     if ( !parseAddress( scursor, send, maybeAddress, isCRLF ) ) {
01278       return false;
01279     }
01280     result.append( maybeAddress );
01281 
01282     eatCFWS( scursor, send, isCRLF );
01283     // end of header: this is OK.
01284     if ( scursor == send ) {
01285       return true;
01286     }
01287     // comma separating entries: eat it.
01288     if ( *scursor == ',' ) {
01289       scursor++;
01290     }
01291   }
01292   return true;
01293 }
01294 
01295 static QString asterisk = QString::fromLatin1( "*0*", 1 );
01296 static QString asteriskZero = QString::fromLatin1( "*0*", 2 );
01297 //static QString asteriskZeroAsterisk = QString::fromLatin1( "*0*", 3 );
01298 
01299 bool parseParameter( const char* &scursor, const char * const send,
01300                      QPair<QString,QStringOrQPair> &result, bool isCRLF )
01301 {
01302   // parameter = regular-parameter / extended-parameter
01303   // regular-parameter = regular-parameter-name "=" value
01304   // extended-parameter =
01305   // value = token / quoted-string
01306   //
01307   // note that rfc2231 handling is out of the scope of this function.
01308   // Therefore we return the attribute as QString and the value as
01309   // (start,length) tupel if we see that the value is encoded
01310   // (trailing asterisk), for parseParameterList to decode...
01311 
01312   eatCFWS( scursor, send, isCRLF );
01313   if ( scursor == send ) {
01314     return false;
01315   }
01316 
01317   //
01318   // parse the parameter name:
01319   //
01320   QString maybeAttribute;
01321   if ( !parseToken( scursor, send, maybeAttribute, false /* no 8bit */ ) ) {
01322     return false;
01323   }
01324 
01325   eatCFWS( scursor, send, isCRLF );
01326   // premature end: not OK (haven't seen '=' yet).
01327   if ( scursor == send || *scursor != '=' ) {
01328     return false;
01329   }
01330   scursor++; // eat '='
01331 
01332   eatCFWS( scursor, send, isCRLF );
01333   if ( scursor == send ) {
01334     // don't choke on attribute=, meaning the value was omitted:
01335     if ( maybeAttribute.endsWith( asterisk ) ) {
01336       KMIME_WARN << "attribute ends with \"*\", but value is empty!"
01337         "Chopping away \"*\".";
01338       maybeAttribute.truncate( maybeAttribute.length() - 1 );
01339     }
01340     result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01341     return true;
01342   }
01343 
01344   const char * oldscursor = scursor;
01345 
01346   //
01347   // parse the parameter value:
01348   //
01349   QStringOrQPair maybeValue;
01350   if ( *scursor == '"' ) {
01351     // value is a quoted-string:
01352     scursor++;
01353     if ( maybeAttribute.endsWith( asterisk ) ) {
01354       // attributes ending with "*" designate extended-parameters,
01355       // which cannot have quoted-strings as values. So we remove the
01356       // trailing "*" to not confuse upper layers.
01357       KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
01358         "Chopping away \"*\".";
01359       maybeAttribute.truncate( maybeAttribute.length() - 1 );
01360     }
01361 
01362     if ( !parseGenericQuotedString( scursor, send, maybeValue.qstring, isCRLF ) ) {
01363       scursor = oldscursor;
01364       result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01365       return false; // this case needs further processing by upper layers!!
01366     }
01367   } else {
01368     // value is a token:
01369     if ( !parseToken( scursor, send, maybeValue.qpair, false /* no 8bit */ ) ) {
01370       scursor = oldscursor;
01371       result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01372       return false; // this case needs further processing by upper layers!!
01373     }
01374   }
01375 
01376   result = qMakePair( maybeAttribute.toLower(), maybeValue );
01377   return true;
01378 }
01379 
01380 bool parseRawParameterList( const char* &scursor, const char * const send,
01381                             QMap<QString,QStringOrQPair> &result,
01382                             bool isCRLF )
01383 {
01384   // we use parseParameter() consecutively to obtain a map of raw
01385   // attributes to raw values. "Raw" here means that we don't do
01386   // rfc2231 decoding and concatenation. This is left to
01387   // parseParameterList(), which will call this function.
01388   //
01389   // The main reason for making this chunk of code a separate
01390   // (private) method is that we can deal with broken parameters
01391   // _here_ and leave the rfc2231 handling solely to
01392   // parseParameterList(), which will still be enough work.
01393 
01394   while ( scursor != send ) {
01395     eatCFWS( scursor, send, isCRLF );
01396     // empty entry ending the list: OK.
01397     if ( scursor == send ) {
01398       return true;
01399     }
01400     // empty list entry: ignore.
01401     if ( *scursor == ';' ) {
01402       scursor++;
01403       continue;
01404     }
01405 
01406     QPair<QString,QStringOrQPair> maybeParameter;
01407     if ( !parseParameter( scursor, send, maybeParameter, isCRLF ) ) {
01408       // we need to do a bit of work if the attribute is not
01409       // NULL. These are the cases marked with "needs further
01410       // processing" in parseParameter(). Specifically, parsing of the
01411       // token or the quoted-string, which should represent the value,
01412       // failed. We take the easy way out and simply search for the
01413       // next ';' to start parsing again. (Another option would be to
01414       // take the text between '=' and ';' as value)
01415       if ( maybeParameter.first.isNull() ) {
01416         return false;
01417       }
01418       while ( scursor != send ) {
01419         if ( *scursor++ == ';' ) {
01420           goto IS_SEMICOLON;
01421         }
01422       }
01423       // scursor == send case: end of list.
01424       return true;
01425     IS_SEMICOLON:
01426       // *scursor == ';' case: parse next entry.
01427       continue;
01428     }
01429     // successful parsing brings us here:
01430     result.insert( maybeParameter.first, maybeParameter.second );
01431 
01432     eatCFWS( scursor, send, isCRLF );
01433     // end of header: ends list.
01434     if ( scursor == send ) {
01435       return true;
01436     }
01437     // regular separator: eat it.
01438     if ( *scursor == ';' ) {
01439       scursor++;
01440     }
01441   }
01442   return true;
01443 }
01444 
01445 static void decodeRFC2231Value( Codec* &rfc2231Codec,
01446                                 QTextCodec* &textcodec,
01447                                 bool isContinuation, QString &value,
01448                                 QPair<const char*,int> &source )
01449 {
01450   //
01451   // parse the raw value into (charset,language,text):
01452   //
01453 
01454   const char * decBegin = source.first;
01455   const char * decCursor = decBegin;
01456   const char * decEnd = decCursor + source.second;
01457 
01458   if ( !isContinuation ) {
01459     // find the first single quote
01460     while ( decCursor != decEnd ) {
01461       if ( *decCursor == '\'' ) {
01462         break;
01463       } else {
01464         decCursor++;
01465       }
01466     }
01467 
01468     if ( decCursor == decEnd ) {
01469       // there wasn't a single single quote at all!
01470       // take the whole value to be in latin-1:
01471       KMIME_WARN << "No charset in extended-initial-value."
01472         "Assuming \"iso-8859-1\".";
01473       value += QString::fromLatin1( decBegin, source.second );
01474       return;
01475     }
01476 
01477     QByteArray charset( decBegin, decCursor - decBegin );
01478 
01479     const char * oldDecCursor = ++decCursor;
01480     // find the second single quote (we ignore the language tag):
01481     while ( decCursor != decEnd ) {
01482       if ( *decCursor == '\'' ) {
01483         break;
01484       } else {
01485         decCursor++;
01486       }
01487     }
01488     if ( decCursor == decEnd ) {
01489       KMIME_WARN << "No language in extended-initial-value."
01490         "Trying to recover.";
01491       decCursor = oldDecCursor;
01492     } else {
01493       decCursor++;
01494     }
01495 
01496     // decCursor now points to the start of the
01497     // "extended-other-values":
01498 
01499     //
01500     // get the decoders:
01501     //
01502 
01503     bool matchOK = false;
01504     textcodec = KGlobal::charsets()->codecForName( charset, matchOK );
01505     if ( !matchOK ) {
01506       textcodec = 0;
01507       KMIME_WARN_UNKNOWN( Charset, charset );
01508     }
01509   }
01510 
01511   if ( !rfc2231Codec ) {
01512     rfc2231Codec = Codec::codecForName("x-kmime-rfc2231");
01513     assert( rfc2231Codec );
01514   }
01515 
01516   if ( !textcodec ) {
01517     value += QString::fromLatin1( decCursor, decEnd - decCursor );
01518     return;
01519   }
01520 
01521   Decoder * dec = rfc2231Codec->makeDecoder();
01522   assert( dec );
01523 
01524   //
01525   // do the decoding:
01526   //
01527 
01528   QByteArray buffer;
01529   buffer.resize( rfc2231Codec->maxDecodedSizeFor( decEnd - decCursor ) );
01530   QByteArray::Iterator bit = buffer.begin();
01531   QByteArray::ConstIterator bend = buffer.end();
01532 
01533   if ( !dec->decode( decCursor, decEnd, bit, bend ) ) {
01534     KMIME_WARN << rfc2231Codec->name()
01535                << "codec lies about its maxDecodedSizeFor()" << endl
01536                << "result may be truncated";
01537   }
01538 
01539   value += textcodec->toUnicode( buffer.begin(), bit - buffer.begin() );
01540 
01541   // kDebug(5320) << "value now: \"" << value << "\"";
01542   // cleanup:
01543   delete dec;
01544 }
01545 
01546 // known issues:
01547 //  - permutes rfc2231 continuations when the total number of parts
01548 //    exceeds 10 (other-sections then becomes *xy, ie. two digits)
01549 
01550 bool parseParameterList( const char* &scursor, const char * const send,
01551                          QMap<QString,QString> &result, bool isCRLF )
01552 {
01553   // parse the list into raw attribute-value pairs:
01554   QMap<QString,QStringOrQPair> rawParameterList;
01555   if (!parseRawParameterList( scursor, send, rawParameterList, isCRLF ) ) {
01556     return false;
01557   }
01558 
01559   if ( rawParameterList.isEmpty() ) {
01560     return true;
01561   }
01562 
01563   // decode rfc 2231 continuations and alternate charset encoding:
01564 
01565   // NOTE: this code assumes that what QMapIterator delivers is sorted
01566   // by the key!
01567 
01568   Codec * rfc2231Codec = 0;
01569   QTextCodec * textcodec = 0;
01570   QString attribute;
01571   QString value;
01572   enum Modes {
01573     NoMode = 0x0, Continued = 0x1, Encoded = 0x2
01574   } mode;
01575 
01576   QMap<QString,QStringOrQPair>::Iterator it, end = rawParameterList.end();
01577 
01578   for ( it = rawParameterList.begin() ; it != end ; ++it ) {
01579     if ( attribute.isNull() || !it.key().startsWith( attribute ) ) {
01580       //
01581       // new attribute:
01582       //
01583 
01584       // store the last attribute/value pair in the result map now:
01585       if ( !attribute.isNull() ) {
01586         result.insert( attribute, value );
01587       }
01588       // and extract the information from the new raw attribute:
01589       value.clear();
01590       attribute = it.key();
01591       mode = NoMode;
01592       // is the value encoded?
01593       if ( attribute.endsWith( asterisk ) ) {
01594         attribute.truncate( attribute.length() - 1 );
01595         mode = (Modes) ((int) mode | Encoded);
01596       }
01597       // is the value continued?
01598       if ( attribute.endsWith( asteriskZero ) ) {
01599         attribute.truncate( attribute.length() - 2 );
01600         mode = (Modes) ((int) mode | Continued);
01601       }
01602       //
01603       // decode if necessary:
01604       //
01605       if ( mode & Encoded ) {
01606         decodeRFC2231Value( rfc2231Codec, textcodec,
01607                             false, /* isn't continuation */
01608                             value, (*it).qpair );
01609       } else {
01610         // not encoded.
01611         if ( (*it).qpair.first ) {
01612           value += QString::fromLatin1( (*it).qpair.first, (*it).qpair.second );
01613         } else {
01614           value += (*it).qstring;
01615         }
01616       }
01617 
01618       //
01619       // shortcut-processing when the value isn't encoded:
01620       //
01621 
01622       if ( !(mode & Continued) ) {
01623         // save result already:
01624         result.insert( attribute, value );
01625         // force begin of a new attribute:
01626         attribute.clear();
01627       }
01628     } else { // it.key().startsWith( attribute )
01629       //
01630       // continuation
01631       //
01632 
01633       // ignore the section and trust QMap to have sorted the keys:
01634       if ( it.key().endsWith( asterisk ) ) {
01635         // encoded
01636         decodeRFC2231Value( rfc2231Codec, textcodec,
01637                             true, /* is continuation */
01638                             value, (*it).qpair );
01639       } else {
01640         // not encoded
01641         if ( (*it).qpair.first ) {
01642           value += QString::fromLatin1( (*it).qpair.first, (*it).qpair.second );
01643         } else {
01644           value += (*it).qstring;
01645         }
01646       }
01647     }
01648   }
01649 
01650   // write last attr/value pair:
01651   if ( !attribute.isNull() ) {
01652     result.insert( attribute, value );
01653   }
01654 
01655   return true;
01656 }
01657 
01658 static const char * stdDayNames[] = {
01659   "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
01660 };
01661 static const int stdDayNamesLen = sizeof stdDayNames / sizeof *stdDayNames;
01662 
01663 static bool parseDayName( const char* &scursor, const char * const send )
01664 {
01665   // check bounds:
01666   if ( send - scursor < 3 ) {
01667     return false;
01668   }
01669 
01670   for ( int i = 0 ; i < stdDayNamesLen ; ++i ) {
01671     if ( qstrnicmp( scursor, stdDayNames[i], 3 ) == 0 ) {
01672       scursor += 3;
01673       // kDebug(5320) << "found" << stdDayNames[i];
01674       return true;
01675     }
01676   }
01677 
01678   return false;
01679 }
01680 
01681 static const char * stdMonthNames[] = {
01682   "Jan", "Feb", "Mar", "Apr", "May", "Jun",
01683   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
01684 };
01685 static const int stdMonthNamesLen =
01686                               sizeof stdMonthNames / sizeof *stdMonthNames;
01687 
01688 static bool parseMonthName( const char* &scursor, const char * const send,
01689                             int &result )
01690 {
01691   // check bounds:
01692   if ( send - scursor < 3 ) {
01693     return false;
01694   }
01695 
01696   for ( result = 0 ; result < stdMonthNamesLen ; ++result ) {
01697     if ( qstrnicmp( scursor, stdMonthNames[result], 3 ) == 0 ) {
01698       scursor += 3;
01699       return true;
01700     }
01701   }
01702 
01703   // not found:
01704   return false;
01705 }
01706 
// Table of alphabetic time zone names and their offsets from GMT in
// seconds (positive values are east of GMT). Searched linearly, first
// match wins.
static const struct {
  const char * tzName;
  long int secsEastOfGMT;
} timeZones[] = {
  // rfc 822 timezones:
  { "GMT", 0 },
  { "UT", 0 },
  { "EDT", -4*3600 },
  { "EST", -5*3600 },
  { "CDT", -5*3600 }, // was mislabelled "MST", which hid the real MST below
  { "CST", -6*3600 },
  { "MDT", -6*3600 },
  { "MST", -7*3600 },
  { "PDT", -7*3600 },
  { "PST", -8*3600 },
  // common, non-rfc-822 zones:
  { "CET", 1*3600 },
  { "MET", 1*3600 },
  { "UTC", 0 },
  { "CEST", 2*3600 },
  { "BST", 1*3600 },
  // rfc 822 military timezones:
  { "Z", 0 },
  { "A", -1*3600 },
  { "B", -2*3600 },
  { "C", -3*3600 },
  { "D", -4*3600 },
  { "E", -5*3600 },
  { "F", -6*3600 },
  { "G", -7*3600 },
  { "H", -8*3600 },
  { "I", -9*3600 },
  // J is not used!
  { "K", -10*3600 },
  { "L", -11*3600 },
  { "M", -12*3600 },
  { "N", 1*3600 },
  { "O", 2*3600 },
  { "P", 3*3600 },
  { "Q", 4*3600 },
  { "R", 5*3600 },
  { "S", 6*3600 },
  { "T", 7*3600 },
  { "U", 8*3600 },
  { "V", 9*3600 },
  { "W", 10*3600 },
  { "X", 11*3600 },
  { "Y", 12*3600 },
};
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
01757 
01758 static bool parseAlphaNumericTimeZone( const char* &scursor,
01759                                        const char * const send,
01760                                        long int &secsEastOfGMT,
01761                                        bool &timeZoneKnown )
01762 {
01763   QPair<const char*,int> maybeTimeZone( 0, 0 );
01764   if ( !parseToken( scursor, send, maybeTimeZone, false /*no 8bit*/ ) ) {
01765     return false;
01766   }
01767   for ( int i = 0 ; i < timeZonesLen ; ++i ) {
01768     if ( qstrnicmp( timeZones[i].tzName,
01769                     maybeTimeZone.first, maybeTimeZone.second ) == 0 ) {
01770       scursor += maybeTimeZone.second;
01771       secsEastOfGMT = timeZones[i].secsEastOfGMT;
01772       timeZoneKnown = true;
01773       return true;
01774     }
01775   }
01776 
01777   // don't choke just because we don't happen to know the time zone
01778   KMIME_WARN_UNKNOWN( time zone,
01779                       QByteArray( maybeTimeZone.first, maybeTimeZone.second ) );
01780   secsEastOfGMT = 0;
01781   timeZoneKnown = false;
01782   return true;
01783 }
01784 
// Parses a run of decimal digits at scursor and returns the number of
// digits consumed (0 if scursor does not point to a digit or equals send).
// result receives the parsed value, or 0 if there were no digits. Values
// too large for an int are clamped to the largest int instead of
// overflowing. scursor is left behind the last digit consumed.
int parseDigits( const char* &scursor, const char * const send, int &result )
{
  // largest value an int can hold (assumes int and unsigned int have the
  // same width, with one bit used for the sign):
  const int maxInt = int( ~0U >> 1 );

  result = 0;
  int digits = 0;
  // isdigit() must not be handed a negative value, which a plain char
  // holding an 8bit character can be; hence the cast.
  for ( ; scursor != send && isdigit( static_cast<unsigned char>( *scursor ) ) ;
        scursor++, digits++ ) {
    const int digit = int( *scursor - '0' );
    if ( result > ( maxInt - digit ) / 10 ) {
      // result * 10 + digit would not fit: saturate
      result = maxInt;
    } else {
      result = result * 10 + digit;
    }
  }
  return digits;
}
01796 
01797 static bool parseTimeOfDay( const char* &scursor, const char * const send,
01798                             int &hour, int &min, int &sec, bool isCRLF=false )
01799 {
01800   // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
01801 
01802   //
01803   // 2DIGIT representing "hour":
01804   //
01805   if ( !parseDigits( scursor, send, hour ) ) {
01806     return false;
01807   }
01808 
01809   eatCFWS( scursor, send, isCRLF );
01810   if ( scursor == send || *scursor != ':' ) {
01811     return false;
01812   }
01813   scursor++; // eat ':'
01814 
01815   eatCFWS( scursor, send, isCRLF );
01816   if ( scursor == send ) {
01817     return false;
01818   }
01819 
01820   //
01821   // 2DIGIT representing "minute":
01822   //
01823   if ( !parseDigits( scursor, send, min ) ) {
01824     return false;
01825   }
01826 
01827   eatCFWS( scursor, send, isCRLF );
01828   if ( scursor == send ) {
01829     return true; // seconds are optional
01830   }
01831 
01832   //
01833   // let's see if we have a 2DIGIT representing "second":
01834   //
01835   if ( *scursor == ':' ) {
01836     // yepp, there are seconds:
01837     scursor++; // eat ':'
01838     eatCFWS( scursor, send, isCRLF );
01839     if ( scursor == send ) {
01840       return false;
01841     }
01842 
01843     if ( !parseDigits( scursor, send, sec ) ) {
01844       return false;
01845     }
01846   } else {
01847     sec = 0;
01848   }
01849 
01850   return true;
01851 }
01852 
01853 bool parseTime( const char* &scursor, const char * send,
01854                 int &hour, int &min, int &sec, long int &secsEastOfGMT,
01855                 bool &timeZoneKnown, bool isCRLF )
01856 {
01857   // time := time-of-day CFWS ( zone / obs-zone )
01858   //
01859   // obs-zone    := "UT" / "GMT" /
01860   //                "EST" / "EDT" / ; -0500 / -0400
01861   //                "CST" / "CDT" / ; -0600 / -0500
01862   //                "MST" / "MDT" / ; -0700 / -0600
01863   //                "PST" / "PDT" / ; -0800 / -0700
01864   //                "A"-"I" / "a"-"i" /
01865   //                "K"-"Z" / "k"-"z"
01866 
01867   eatCFWS( scursor, send, isCRLF );
01868   if ( scursor == send ) {
01869     return false;
01870   }
01871 
01872   if ( !parseTimeOfDay( scursor, send, hour, min, sec, isCRLF ) ) {
01873     return false;
01874   }
01875 
01876   eatCFWS( scursor, send, isCRLF );
01877   if ( scursor == send ) {
01878     timeZoneKnown = false;
01879     secsEastOfGMT = 0;
01880     return true; // allow missing timezone
01881   }
01882 
01883   timeZoneKnown = true;
01884   if ( *scursor == '+' || *scursor == '-' ) {
01885     // remember and eat '-'/'+':
01886     const char sign = *scursor++;
01887     // numerical timezone:
01888     int maybeTimeZone;
01889     if ( parseDigits( scursor, send, maybeTimeZone ) != 4 ) {
01890       return false;
01891     }
01892     secsEastOfGMT = 60 * ( maybeTimeZone / 100 * 60 + maybeTimeZone % 100 );
01893     if ( sign == '-' ) {
01894       secsEastOfGMT *= -1;
01895       if ( secsEastOfGMT == 0 ) {
01896         timeZoneKnown = false; // -0000 means indetermined tz
01897       }
01898     }
01899   } else {
01900     // maybe alphanumeric timezone:
01901     if ( !parseAlphaNumericTimeZone( scursor, send, secsEastOfGMT, timeZoneKnown ) ) {
01902       return false;
01903     }
01904   }
01905   return true;
01906 }
01907 
01908 bool parseDateTime( const char* &scursor, const char * const send,
01909                     KDateTime &result, bool isCRLF )
01910 {
01911   // Parsing date-time; strict mode:
01912   //
01913   // date-time   := [ [CFWS] day-name [CFWS] "," ]                      ; wday
01914   // (expanded)     [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date
01915   //                time
01916   //
01917   // day-name    := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
01918   // month-name  := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
01919   //                "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
01920 
01921   result = KDateTime();
01922   QDateTime maybeDateTime;
01923 
01924   eatCFWS( scursor, send, isCRLF );
01925   if ( scursor == send ) {
01926     return false;
01927   }
01928 
01929   //
01930   // let's see if there's a day-of-week:
01931   //
01932   if ( parseDayName( scursor, send ) ) {
01933     eatCFWS( scursor, send, isCRLF );
01934     if ( scursor == send ) {
01935       return false;
01936     }
01937     // day-name should be followed by ',' but we treat it as optional:
01938     if ( *scursor == ',' ) {
01939       scursor++; // eat ','
01940       eatCFWS( scursor, send, isCRLF );
01941     }
01942   }
01943 
01944   //
01945   // 1*2DIGIT representing "day" (of month):
01946   //
01947   int maybeDay;
01948   if ( !parseDigits( scursor, send, maybeDay ) ) {
01949     return false;
01950   }
01951 
01952   eatCFWS( scursor, send, isCRLF );
01953   if ( scursor == send ) {
01954     return false;
01955   }
01956 
01957   //
01958   // month-name:
01959   //
01960   int maybeMonth = 0;
01961   if ( !parseMonthName( scursor, send, maybeMonth ) ) {
01962     return false;
01963   }
01964   if ( scursor == send ) {
01965     return false;
01966   }
01967   assert( maybeMonth >= 0 ); assert( maybeMonth <= 11 );
01968   ++maybeMonth; // 0-11 -> 1-12
01969 
01970   eatCFWS( scursor, send, isCRLF );
01971   if ( scursor == send ) {
01972     return false;
01973   }
01974 
01975   //
01976   // 2*DIGIT representing "year":
01977   //
01978   int maybeYear;
01979   if ( !parseDigits( scursor, send, maybeYear ) ) {
01980     return false;
01981   }
01982   // RFC 2822 4.3 processing:
01983   if ( maybeYear < 50 ) {
01984     maybeYear += 2000;
01985   } else if ( maybeYear < 1000 ) {
01986     maybeYear += 1900;
01987   }
01988   // else keep as is
01989   if ( maybeYear < 1900 ) {
01990     return false; // rfc2822, 3.3
01991   }
01992 
01993   eatCFWS( scursor, send, isCRLF );
01994   if ( scursor == send ) {
01995     return false;
01996   }
01997 
01998   maybeDateTime.setDate( QDate( maybeYear, maybeMonth, maybeDay ) );
01999 
02000   //
02001   // time
02002   //
02003   int maybeHour, maybeMinute, maybeSecond;
02004   long int secsEastOfGMT;
02005   bool timeZoneKnown = true;
02006 
02007   if ( !parseTime( scursor, send,
02008                    maybeHour, maybeMinute, maybeSecond,
02009                    secsEastOfGMT, timeZoneKnown, isCRLF ) ) {
02010     return false;
02011   }
02012 
02013   maybeDateTime.setTime( QTime( maybeHour, maybeMinute, maybeSecond ) );
02014   if ( !maybeDateTime.isValid() )
02015     return false;
02016 
02017   result = KDateTime( maybeDateTime, KDateTime::Spec( KDateTime::OffsetFromUTC, secsEastOfGMT ) );
02018   if ( !result.isValid() )
02019     return false;
02020   return true;
02021 }
02022 
02023 } // namespace HeaderParsing
02024 
02025 } // namespace KMime

KMIME Library

Skip menu "KMIME Library"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

KDE-PIM Libraries

Skip menu "KDE-PIM Libraries"
  • akonadi
  • kabc
  • kblog
  • kcal
  • kimap
  • kioslave
  •   imap4
  •   mbox
  • kldap
  • kmime
  • kpimidentities
  •   richtextbuilders
  • kpimutils
  • kresources
  • ktnef
  • kxmlrpcclient
  • mailtransport
  • qgpgme
  • syndication
  •   atom
  •   rdf
  •   rss2
Generated for KDE-PIM Libraries by doxygen 1.5.7.1
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal