• Skip to content
  • Skip to link menu
KDE 4.1 API Reference
  • KDE API Reference
  • KDE-PIM Libraries
  • Sitemap
  • Contact Us
 

KMIME Library

kmime_header_parsing.cpp

00001 /*  -*- c++ -*-
00002     kmime_header_parsing.cpp
00003 
00004     KMime, the KDE internet mail/usenet news message library.
00005     Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
00006 
00007     This library is free software; you can redistribute it and/or
00008     modify it under the terms of the GNU Library General Public
00009     License as published by the Free Software Foundation; either
00010     version 2 of the License, or (at your option) any later version.
00011 
00012     This library is distributed in the hope that it will be useful,
00013     but WITHOUT ANY WARRANTY; without even the implied warranty of
00014     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015     Library General Public License for more details.
00016 
00017     You should have received a copy of the GNU Library General Public License
00018     along with this library; see the file COPYING.LIB.  If not, write to
00019     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00020     Boston, MA 02110-1301, USA.
00021 */
00022 
00023 #include "kmime_header_parsing.h"
00024 
00025 #include "kmime_codecs.h"
00026 #include "kmime_util.h"
00027 #include "kmime_dateformatter.h"
00028 #include "kmime_warning.h"
00029 
00030 #include <kglobal.h>
00031 #include <kcharsets.h>
00032 
00033 #include <QtCore/QTextCodec>
00034 #include <QtCore/QMap>
00035 #include <QtCore/QStringList>
00036 #include <QtCore/QUrl>
00037 
00038 #include <ctype.h> // for isdigit
00039 #include <cassert>
00040 
00041 using namespace KMime;
00042 using namespace KMime::Types;
00043 
00044 namespace KMime {
00045 
00046 namespace Types {
00047 
00048 static QString addr_spec_as_string( const AddrSpec & as, bool pretty )
00049 {
00050   bool needsQuotes = false;
00051   QString result;
00052   if ( as.isEmpty() )
00053       return QString();
00054   for ( int i = 0 ; i < as.localPart.length() ; ++i ) {
00055     const char ch = as.localPart[i].toLatin1();
00056     if ( ch == '.' || isAText( ch ) ) {
00057       result += ch;
00058     } else {
00059       needsQuotes = true;
00060       if ( ch == '\\' || ch == '"' ) {
00061         result += '\\';
00062       }
00063       result += ch;
00064     }
00065   }
00066   const QString dom = pretty ? QUrl::fromAce( as.domain.toLatin1() ) : as.domain ;
00067   if ( needsQuotes ) {
00068     return '"' + result + "\"@" + dom;
00069   } else {
00070     return result + '@' + dom;
00071   }
00072 }
00073 
00074 QString AddrSpec::asString() const
00075 {
00076     return addr_spec_as_string( *this, false );
00077 }
00078 
00079 QString AddrSpec::asPrettyString() const
00080 {
00081     return addr_spec_as_string( *this, true );
00082 }
00083 
00084 bool AddrSpec::isEmpty() const
00085 {
00086   return localPart.isEmpty() && domain.isEmpty();
00087 }
00088 
00089 QByteArray Mailbox::address() const
00090 {
00091   return mAddrSpec.asString().toLatin1();
00092 }
00093 
00094 AddrSpec Mailbox::addrSpec() const
00095 {
00096   return mAddrSpec;
00097 }
00098 
00099 QString Mailbox::name() const
00100 {
00101   return mDisplayName;
00102 }
00103 
00104 void Mailbox::setAddress( const AddrSpec &addr )
00105 {
00106   mAddrSpec = addr;
00107 }
00108 
00109 void Mailbox::setAddress( const QByteArray &addr )
00110 {
00111   const char *cursor = addr.constData();
00112   if ( !HeaderParsing::parseAngleAddr( cursor,
00113                                        cursor + addr.length(), mAddrSpec ) ) {
00114     if ( !HeaderParsing::parseAddrSpec( cursor, cursor + addr.length(),
00115                                         mAddrSpec ) ) {
00116       kWarning() << "Invalid address";
00117       return;
00118     }
00119   }
00120 }
00121 
00122 void Mailbox::setName( const QString &name )
00123 {
00124   mDisplayName = name;
00125 }
00126 
00127 void Mailbox::setNameFrom7Bit( const QByteArray &name,
00128                                const QByteArray &defaultCharset )
00129 {
00130   QByteArray cs;
00131   mDisplayName = decodeRFC2047String( name, cs, defaultCharset, false );
00132 }
00133 
00134 bool Mailbox::hasAddress() const
00135 {
00136   return !mAddrSpec.isEmpty();
00137 }
00138 
00139 bool Mailbox::hasName() const
00140 {
00141   return !mDisplayName.isEmpty();
00142 }
00143 
00144 QString Mailbox::prettyAddress() const
00145 {
00146   if ( !hasName() ) {
00147     return address();
00148   }
00149   QString s = name();
00150   if ( hasAddress() ) {
00151     s += QLatin1String(" <") + address() + QLatin1Char('>');
00152   }
00153   return s;
00154 }
00155 
00156 void Mailbox::fromUnicodeString( const QString &s )
00157 {
00158   from7BitString( encodeRFC2047String( s, "utf-8", false ) );
00159 }
00160 
00161 void Mailbox::from7BitString( const QByteArray &s )
00162 {
00163   const char *cursor = s.constData();
00164   HeaderParsing::parseMailbox( cursor, cursor + s.length(), *this );
00165 }
00166 
00167 QByteArray KMime::Types::Mailbox::as7BitString( const QByteArray &encCharset ) const
00168 {
00169   if ( !hasName() ) {
00170     return address();
00171   }
00172   QByteArray rv;
00173   if ( isUsAscii( name() ) ) {
00174     QByteArray tmp = name().toLatin1();
00175     addQuotes( tmp, false );
00176     rv += tmp;
00177   } else {
00178     rv += encodeRFC2047String( name(), encCharset, true );
00179   }
00180   if ( hasAddress() ) {
00181     rv += " <" + address() + '>';
00182   }
00183   return rv;
00184 }
00185 
00186 } // namespace Types
00187 
00188 namespace HeaderParsing {
00189 
00190 // parse the encoded-word (scursor points to after the initial '=')
00191 bool parseEncodedWord( const char* &scursor, const char * const send,
00192                        QString &result, QByteArray &language,
00193                        QByteArray &usedCS, const QByteArray &defaultCS,
00194                        bool forceCS )
00195 {
00196   // make sure the caller already did a bit of the work.
00197   assert( *(scursor-1) == '=' );
00198 
00199   //
00200   // STEP 1:
00201   // scan for the charset/language portion of the encoded-word
00202   //
00203 
00204   char ch = *scursor++;
00205 
00206   if ( ch != '?' ) {
00207     kDebug(5320) << "first";
00208     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00209     return false;
00210   }
00211 
00212   // remember start of charset (ie. just after the initial "=?") and
00213   // language (just after the first '*') fields:
00214   const char * charsetStart = scursor;
00215   const char * languageStart = 0;
00216 
00217   // find delimiting '?' (and the '*' separating charset and language
00218   // tags, if any):
00219   for ( ; scursor != send ; scursor++ ) {
00220     if ( *scursor == '?') {
00221       break;
00222     } else if ( *scursor == '*' && languageStart == 0 ) {
00223       languageStart = scursor + 1;
00224     }
00225   }
00226 
00227   // not found? can't be an encoded-word!
00228   if ( scursor == send || *scursor != '?' ) {
00229     kDebug(5320) << "second";
00230     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00231     return false;
00232   }
00233 
00234   // extract the language information, if any (if languageStart is 0,
00235   // language will be null, too):
00236   QByteArray maybeLanguage( languageStart, scursor - languageStart );
00237   // extract charset information (keep in mind: the size given to the
00238   // ctor is one off due to the \0 terminator):
00239   QByteArray maybeCharset( charsetStart,
00240                            ( languageStart ? languageStart - 1 : scursor ) - charsetStart );
00241 
00242   //
00243   // STEP 2:
00244   // scan for the encoding portion of the encoded-word
00245   //
00246 
00247   // remember start of encoding (just _after_ the second '?'):
00248   scursor++;
00249   const char * encodingStart = scursor;
00250 
00251   // find next '?' (ending the encoding tag):
00252   for ( ; scursor != send ; scursor++ ) {
00253     if ( *scursor == '?' ) {
00254       break;
00255     }
00256   }
00257 
00258   // not found? Can't be an encoded-word!
00259   if ( scursor == send || *scursor != '?' ) {
00260     kDebug(5320) << "third";
00261     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00262     return false;
00263   }
00264 
00265   // extract the encoding information:
00266   QByteArray maybeEncoding( encodingStart, scursor - encodingStart );
00267 
00268   kDebug(5320) << "parseEncodedWord: found charset == \"" << maybeCharset
00269            << "\"; language == \"" << maybeLanguage
00270            << "\"; encoding == \"" << maybeEncoding << "\"";
00271 
00272   //
00273   // STEP 3:
00274   // scan for encoded-text portion of encoded-word
00275   //
00276 
00277   // remember start of encoded-text (just after the third '?'):
00278   scursor++;
00279   const char * encodedTextStart = scursor;
00280 
00281   // find next '?' (ending the encoded-text):
00282   for ( ; scursor != send ; scursor++ ) {
00283     if ( *scursor == '?' ) {
00284       break;
00285     }
00286   }
00287 
00288   // not found? Can't be an encoded-word!
00289   // ### maybe evaluate it nonetheless if the rest is OK?
00290   if ( scursor == send || *scursor != '?' ) {
00291     kDebug(5320) << "fourth";
00292     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00293     return false;
00294   }
00295   scursor++;
00296   // check for trailing '=':
00297   if ( scursor == send || *scursor != '=' ) {
00298     kDebug(5320) << "fifth";
00299     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00300     return false;
00301   }
00302   scursor++;
00303 
00304   // set end sentinel for encoded-text:
00305   const char * const encodedTextEnd = scursor - 2;
00306 
00307   //
00308   // STEP 4:
00309   // setup decoders for the transfer encoding and the charset
00310   //
00311 
00312   // try if there's a codec for the encoding found:
00313   Codec * codec = Codec::codecForName( maybeEncoding );
00314   if ( !codec ) {
00315     KMIME_WARN_UNKNOWN( Encoding, maybeEncoding );
00316     return false;
00317   }
00318 
00319   // get an instance of a corresponding decoder:
00320   Decoder * dec = codec->makeDecoder();
00321   assert( dec );
00322 
00323   // try if there's a (text)codec for the charset found:
00324   bool matchOK = false;
00325   QTextCodec *textCodec = 0;
00326   if ( forceCS || maybeCharset.isEmpty() ) {
00327     textCodec = KGlobal::charsets()->codecForName( defaultCS, matchOK );
00328     usedCS = cachedCharset( defaultCS );
00329   } else {
00330     textCodec = KGlobal::charsets()->codecForName( maybeCharset, matchOK );
00331     if ( !matchOK ) {  //no suitable codec found => use default charset
00332       textCodec = KGlobal::charsets()->codecForName( defaultCS, matchOK );
00333       usedCS = cachedCharset( defaultCS );
00334     } else {
00335       usedCS = cachedCharset( maybeCharset );
00336     }
00337   }
00338 
00339   if ( !matchOK || !textCodec ) {
00340     KMIME_WARN_UNKNOWN( Charset, maybeCharset );
00341     delete dec;
00342     return false;
00343   };
00344 
00345   kDebug(5320) << "mimeName(): \"" << textCodec->name() << "\"";
00346 
00347   // allocate a temporary buffer to store the 8bit text:
00348   int encodedTextLength = encodedTextEnd - encodedTextStart;
00349   QByteArray buffer;
00350   buffer.resize( codec->maxDecodedSizeFor( encodedTextLength ) );
00351   QByteArray::Iterator bit = buffer.begin();
00352   QByteArray::ConstIterator bend = buffer.end();
00353 
00354   //
00355   // STEP 5:
00356   // do the actual decoding
00357   //
00358 
00359   if ( !dec->decode( encodedTextStart, encodedTextEnd, bit, bend ) ) {
00360     KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
00361                << encodedTextLength << ")\nresult may be truncated";
00362   }
00363 
00364   result = textCodec->toUnicode( buffer.begin(), bit - buffer.begin() );
00365 
00366   kDebug(5320) << "result now: \"" << result << "\"";
00367   // cleanup:
00368   delete dec;
00369   language = maybeLanguage;
00370 
00371   return true;
00372 }
00373 
// Advances \a scursor past any run of space, tab, CR and LF characters,
// stopping at the first other character or at \a send.
static inline void eatWhiteSpace( const char* &scursor, const char * const send )
{
  for ( ; scursor != send ; ++scursor ) {
    const char c = *scursor;
    const bool isBlank = ( c == ' ' || c == '\t' || c == '\r' || c == '\n' );
    if ( !isBlank ) {
      return;
    }
  }
}
00381 
00382 bool parseAtom( const char * &scursor, const char * const send,
00383                 QString &result, bool allow8Bit )
00384 {
00385   QPair<const char*,int> maybeResult;
00386 
00387   if ( parseAtom( scursor, send, maybeResult, allow8Bit ) ) {
00388     result += QString::fromLatin1( maybeResult.first, maybeResult.second );
00389     return true;
00390   }
00391 
00392   return false;
00393 }
00394 
00395 bool parseAtom( const char * &scursor, const char * const send,
00396                 QPair<const char*,int> &result, bool allow8Bit )
00397 {
00398   bool success = false;
00399   const char *start = scursor;
00400 
00401   while ( scursor != send ) {
00402     signed char ch = *scursor++;
00403     if ( ch > 0 && isAText( ch ) ) {
00404       // AText: OK
00405       success = true;
00406     } else if ( allow8Bit && ch < 0 ) {
00407       // 8bit char: not OK, but be tolerant.
00408       KMIME_WARN_8BIT( ch );
00409       success = true;
00410     } else {
00411       // CTL or special - marking the end of the atom:
00412       // re-set sursor to point to the offending
00413       // char and return:
00414       scursor--;
00415       break;
00416     }
00417   }
00418   result.first = start;
00419   result.second = scursor - start;
00420   return success;
00421 }
00422 
00423 bool parseToken( const char * &scursor, const char * const send,
00424                  QString &result, bool allow8Bit )
00425 {
00426   QPair<const char*,int> maybeResult;
00427 
00428   if ( parseToken( scursor, send, maybeResult, allow8Bit ) ) {
00429     result += QString::fromLatin1( maybeResult.first, maybeResult.second );
00430     return true;
00431   }
00432 
00433   return false;
00434 }
00435 
00436 bool parseToken( const char * &scursor, const char * const send,
00437                  QPair<const char*,int> &result, bool allow8Bit )
00438 {
00439   bool success = false;
00440   const char * start = scursor;
00441 
00442   while ( scursor != send ) {
00443     signed char ch = *scursor++;
00444     if ( ch > 0 && isTText( ch ) ) {
00445       // TText: OK
00446       success = true;
00447     } else if ( allow8Bit && ch < 0 ) {
00448       // 8bit char: not OK, but be tolerant.
00449       KMIME_WARN_8BIT( ch );
00450       success = true;
00451     } else {
00452       // CTL or tspecial - marking the end of the atom:
00453       // re-set sursor to point to the offending
00454       // char and return:
00455       scursor--;
00456       break;
00457     }
00458   }
00459   result.first = start;
00460   result.second = scursor - start;
00461   return success;
00462 }
00463 
00464 #define READ_ch_OR_FAIL if ( scursor == send ) {        \
00465     KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \
00466     return false;                                       \
00467   } else {                                              \
00468     ch = *scursor++;                                    \
00469   }
00470 
00471 // known issues:
00472 //
00473 // - doesn't handle quoted CRLF
00474 
00475 bool parseGenericQuotedString( const char* &scursor, const char * const send,
00476                                QString &result, bool isCRLF,
00477                                const char openChar, const char closeChar )
00478 {
00479   char ch;
00480   // We are in a quoted-string or domain-literal or comment and the
00481   // cursor points to the first char after the openChar.
00482   // We will apply unfolding and quoted-pair removal.
00483   // We return when we either encounter the end or unescaped openChar
00484   // or closeChar.
00485 
00486   assert( *(scursor-1) == openChar || *(scursor-1) == closeChar );
00487 
00488   while ( scursor != send ) {
00489     ch = *scursor++;
00490 
00491     if ( ch == closeChar || ch == openChar ) {
00492       // end of quoted-string or another opening char:
00493       // let caller decide what to do.
00494       return true;
00495     }
00496 
00497     switch( ch ) {
00498     case '\\':      // quoted-pair
00499       // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
00500       READ_ch_OR_FAIL;
00501       KMIME_WARN_IF_8BIT( ch );
00502       result += QChar( ch );
00503       break;
00504     case '\r':
00505       // ###
00506       // The case of lonely '\r' is easy to solve, as they're
00507       // not part of Unix Line-ending conventions.
00508       // But I see a problem if we are given Unix-native
00509       // line-ending-mails, where we cannot determine anymore
00510       // whether a given '\n' was part of a CRLF or was occurring
00511       // on it's own.
00512       READ_ch_OR_FAIL;
00513       if ( ch != '\n' ) {
00514         // CR on it's own...
00515         KMIME_WARN_LONE( CR );
00516         result += QChar('\r');
00517         scursor--; // points to after the '\r' again
00518       } else {
00519         // CRLF encountered.
00520         // lookahead: check for folding
00521         READ_ch_OR_FAIL;
00522         if ( ch == ' ' || ch == '\t' ) {
00523           // correct folding;
00524           // position cursor behind the CRLF WSP (unfolding)
00525           // and add the WSP to the result
00526           result += QChar( ch );
00527         } else {
00528           // this is the "shouldn't happen"-case. There is a CRLF
00529           // inside a quoted-string without it being part of FWS.
00530           // We take it verbatim.
00531           KMIME_WARN_NON_FOLDING( CRLF );
00532           result += "\r\n";
00533           // the cursor is decremented again, so's we need not
00534           // duplicate the whole switch here. "ch" could've been
00535           // everything (incl. openChar or closeChar).
00536           scursor--;
00537         }
00538       }
00539       break;
00540     case '\n':
00541       // Note: CRLF has been handled above already!
00542       // ### LF needs special treatment, depending on whether isCRLF
00543       // is true (we can be sure a lonely '\n' was meant this way) or
00544       // false ('\n' alone could have meant LF or CRLF in the original
00545       // message. This parser assumes CRLF iff the LF is followed by
00546       // either WSP (folding) or NULL (premature end of quoted-string;
00547       // Should be fixed, since NULL is allowed as per rfc822).
00548       READ_ch_OR_FAIL;
00549       if ( !isCRLF && ( ch == ' ' || ch == '\t' ) ) {
00550         // folding
00551         // correct folding
00552         result += QChar( ch );
00553       } else {
00554         // non-folding
00555         KMIME_WARN_LONE( LF );
00556         result += QChar('\n');
00557         // pos is decremented, so's we need not duplicate the whole
00558         // switch here. ch could've been everything (incl. <">, "\").
00559         scursor--;
00560       }
00561       break;
00562     default:
00563       KMIME_WARN_IF_8BIT( ch );
00564       result += QChar( ch );
00565     }
00566   }
00567 
00568   return false;
00569 }
00570 
00571 // known issues:
00572 //
00573 // - doesn't handle encoded-word inside comments.
00574 
00575 bool parseComment( const char* &scursor, const char * const send,
00576                    QString &result, bool isCRLF, bool reallySave )
00577 {
00578   int commentNestingDepth = 1;
00579   const char *afterLastClosingParenPos = 0;
00580   QString maybeCmnt;
00581   const char *oldscursor = scursor;
00582 
00583   assert( *(scursor-1) == '(' );
00584 
00585   while ( commentNestingDepth ) {
00586     QString cmntPart;
00587     if ( parseGenericQuotedString( scursor, send, cmntPart, isCRLF, '(', ')' ) ) {
00588       assert( *(scursor-1) == ')' || *(scursor-1) == '(' );
00589       // see the kdoc for above function for the possible conditions
00590       // we have to check:
00591       switch ( *(scursor-1) ) {
00592       case ')':
00593         if ( reallySave ) {
00594           // add the chunk that's now surely inside the comment.
00595           result += maybeCmnt;
00596           result += cmntPart;
00597           if ( commentNestingDepth > 1 ) {
00598             // don't add the outermost ')'...
00599             result += QChar(')');
00600           }
00601           maybeCmnt.clear();
00602         }
00603         afterLastClosingParenPos = scursor;
00604         --commentNestingDepth;
00605         break;
00606       case '(':
00607         if ( reallySave ) {
00608           // don't add to "result" yet, because we might find that we
00609           // are already outside the (broken) comment...
00610           maybeCmnt += cmntPart;
00611           maybeCmnt += QChar('(');
00612         }
00613         ++commentNestingDepth;
00614         break;
00615       default: assert( 0 );
00616       } // switch
00617     } else {
00618       // !parseGenericQuotedString, ie. premature end
00619       if ( afterLastClosingParenPos ) {
00620         scursor = afterLastClosingParenPos;
00621       } else {
00622         scursor = oldscursor;
00623       }
00624       return false;
00625     }
00626   } // while
00627 
00628   return true;
00629 }
00630 
00631 // known issues: none.
00632 
00633 bool parsePhrase( const char* &scursor, const char * const send,
00634                   QString &result, bool isCRLF )
00635 {
00636   enum {
00637     None, Phrase, Atom, EncodedWord, QuotedString
00638   } found = None;
00639 
00640   QString tmp;
00641   QByteArray lang, charset;
00642   const char *successfullyParsed = 0;
00643   // only used by the encoded-word branch
00644   const char *oldscursor;
00645   // used to suppress whitespace between adjacent encoded-words
00646   // (rfc2047, 6.2):
00647   bool lastWasEncodedWord = false;
00648 
00649   while ( scursor != send ) {
00650     char ch = *scursor++;
00651     switch ( ch ) {
00652     case '.': // broken, but allow for intorop's sake
00653       if ( found == None ) {
00654         --scursor;
00655         return false;
00656       } else {
00657         if ( scursor != send && ( *scursor == ' ' || *scursor == '\t' ) ) {
00658           result += ". ";
00659         } else {
00660           result += '.';
00661         }
00662         successfullyParsed = scursor;
00663       }
00664       break;
00665     case '"': // quoted-string
00666       tmp.clear();
00667       if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
00668         successfullyParsed = scursor;
00669         assert( *(scursor-1) == '"' );
00670         switch ( found ) {
00671         case None:
00672           found = QuotedString;
00673           break;
00674         case Phrase:
00675         case Atom:
00676         case EncodedWord:
00677         case QuotedString:
00678           found = Phrase;
00679           result += QChar(' '); // rfc822, 3.4.4
00680           break;
00681         default:
00682           assert( 0 );
00683         }
00684         lastWasEncodedWord = false;
00685         result += tmp;
00686       } else {
00687         // premature end of quoted string.
00688         // What to do? Return leading '"' as special? Return as quoted-string?
00689         // We do the latter if we already found something, else signal failure.
00690         if ( found == None ) {
00691           return false;
00692         } else {
00693           result += QChar(' '); // rfc822, 3.4.4
00694           result += tmp;
00695           return true;
00696         }
00697       }
00698       break;
00699     case '(': // comment
00700       // parse it, but ignore content:
00701       tmp.clear();
00702       if ( parseComment( scursor, send, tmp, isCRLF,
00703                          false /*don't bother with the content*/ ) ) {
00704         successfullyParsed = scursor;
00705         lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
00706       } else {
00707         if ( found == None ) {
00708           return false;
00709         } else {
00710           scursor = successfullyParsed;
00711           return true;
00712         }
00713       }
00714       break;
00715     case '=': // encoded-word
00716       tmp.clear();
00717       oldscursor = scursor;
00718       lang.clear();
00719       charset.clear();
00720       if ( parseEncodedWord( scursor, send, tmp, lang, charset ) ) {
00721         successfullyParsed = scursor;
00722         switch ( found ) {
00723         case None:
00724           found = EncodedWord;
00725           break;
00726         case Phrase:
00727         case EncodedWord:
00728         case Atom:
00729         case QuotedString:
00730           if ( !lastWasEncodedWord ) {
00731             result += QChar(' '); // rfc822, 3.4.4
00732           }
00733           found = Phrase;
00734           break;
00735         default: assert( 0 );
00736         }
00737         lastWasEncodedWord = true;
00738         result += tmp;
00739         break;
00740       } else {
00741         // parse as atom:
00742         scursor = oldscursor;
00743       }
00744       // fall though...
00745 
00746     default: //atom
00747       tmp.clear();
00748       scursor--;
00749       if ( parseAtom( scursor, send, tmp, true /* allow 8bit */ ) ) {
00750         successfullyParsed = scursor;
00751         switch ( found ) {
00752         case None:
00753           found = Atom;
00754           break;
00755         case Phrase:
00756         case Atom:
00757         case EncodedWord:
00758         case QuotedString:
00759           found = Phrase;
00760           result += QChar(' '); // rfc822, 3.4.4
00761           break;
00762         default:
00763           assert( 0 );
00764         }
00765         lastWasEncodedWord = false;
00766         result += tmp;
00767       } else {
00768         if ( found == None ) {
00769           return false;
00770         } else {
00771           scursor = successfullyParsed;
00772           return true;
00773         }
00774       }
00775     }
00776     eatWhiteSpace( scursor, send );
00777   }
00778 
00779   return found != None;
00780 }
00781 
00782 bool parseDotAtom( const char* &scursor, const char * const send,
00783                    QString &result, bool isCRLF )
00784 {
00785   eatCFWS( scursor, send, isCRLF );
00786 
00787   // always points to just after the last atom parsed:
00788   const char *successfullyParsed;
00789 
00790   QString tmp;
00791   if ( !parseAtom( scursor, send, tmp, false /* no 8bit */ ) ) {
00792     return false;
00793   }
00794   result += tmp;
00795   successfullyParsed = scursor;
00796 
00797   while ( scursor != send ) {
00798 
00799     // end of header or no '.' -> return
00800     if ( scursor == send || *scursor != '.' ) {
00801       return true;
00802     }
00803     scursor++; // eat '.'
00804 
00805     if ( scursor == send || !isAText( *scursor ) ) {
00806       // end of header or no AText, but this time following a '.'!:
00807       // reset cursor to just after last successfully parsed char and
00808       // return:
00809       scursor = successfullyParsed;
00810       return true;
00811     }
00812 
00813     // try to parse the next atom:
00814     QString maybeAtom;
00815     if ( !parseAtom( scursor, send, maybeAtom, false /*no 8bit*/ ) ) {
00816       scursor = successfullyParsed;
00817       return true;
00818     }
00819 
00820     result += QChar('.');
00821     result += maybeAtom;
00822     successfullyParsed = scursor;
00823   }
00824 
00825   scursor = successfullyParsed;
00826   return true;
00827 }
00828 
00829 void eatCFWS( const char* &scursor, const char * const send, bool isCRLF )
00830 {
00831   QString dummy;
00832 
00833   while ( scursor != send ) {
00834     const char *oldscursor = scursor;
00835 
00836     char ch = *scursor++;
00837 
00838     switch( ch ) {
00839     case ' ':
00840     case '\t': // whitespace
00841     case '\r':
00842     case '\n': // folding
00843       continue;
00844 
00845     case '(': // comment
00846       if ( parseComment( scursor, send, dummy, isCRLF, false /*don't save*/ ) ) {
00847         continue;
00848       }
00849       scursor = oldscursor;
00850       return;
00851 
00852     default:
00853       scursor = oldscursor;
00854       return;
00855     }
00856   }
00857 }
00858 
00859 bool parseDomain( const char* &scursor, const char * const send,
00860                   QString &result, bool isCRLF )
00861 {
00862   eatCFWS( scursor, send, isCRLF );
00863   if ( scursor == send ) {
00864     return false;
00865   }
00866 
00867   // domain := dot-atom / domain-literal / atom *("." atom)
00868   //
00869   // equivalent to:
00870   // domain = dot-atom / domain-literal,
00871   // since parseDotAtom does allow CFWS between atoms and dots
00872 
00873   if ( *scursor == '[' ) {
00874     // domain-literal:
00875     QString maybeDomainLiteral;
00876     // eat '[':
00877     scursor++;
00878     while ( parseGenericQuotedString( scursor, send, maybeDomainLiteral,
00879                                       isCRLF, '[', ']' ) ) {
00880       if ( scursor == send ) {
00881         // end of header: check for closing ']':
00882         if ( *(scursor-1) == ']' ) {
00883           // OK, last char was ']':
00884           result = maybeDomainLiteral;
00885           return true;
00886         } else {
00887           // not OK, domain-literal wasn't closed:
00888           return false;
00889         }
00890       }
00891       // we hit openChar in parseGenericQuotedString.
00892       // include it in maybeDomainLiteral and keep on parsing:
00893       if ( *(scursor-1) == '[' ) {
00894         maybeDomainLiteral += QChar('[');
00895         continue;
00896       }
00897       // OK, real end of domain-literal:
00898       result = maybeDomainLiteral;
00899       return true;
00900     }
00901   } else {
00902     // dot-atom:
00903     QString maybeDotAtom;
00904     if ( parseDotAtom( scursor, send, maybeDotAtom, isCRLF ) ) {
00905       result = maybeDotAtom;
00906       return true;
00907     }
00908   }
00909   return false;
00910 }
00911 
00912 bool parseObsRoute( const char* &scursor, const char* const send,
00913                     QStringList &result, bool isCRLF, bool save )
00914 {
00915   while ( scursor != send ) {
00916     eatCFWS( scursor, send, isCRLF );
00917     if ( scursor == send ) {
00918       return false;
00919     }
00920 
00921     // empty entry:
00922     if ( *scursor == ',' ) {
00923       scursor++;
00924       if ( save ) {
00925         result.append( QString() );
00926       }
00927       continue;
00928     }
00929 
00930     // empty entry ending the list:
00931     if ( *scursor == ':' ) {
00932       scursor++;
00933       if ( save ) {
00934         result.append( QString() );
00935       }
00936       return true;
00937     }
00938 
00939     // each non-empty entry must begin with '@':
00940     if ( *scursor != '@' ) {
00941       return false;
00942     } else {
00943       scursor++;
00944     }
00945 
00946     QString maybeDomain;
00947     if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
00948       return false;
00949     }
00950     if ( save ) {
00951       result.append( maybeDomain );
00952     }
00953 
00954     // eat the following (optional) comma:
00955     eatCFWS( scursor, send, isCRLF );
00956     if ( scursor == send ) {
00957       return false;
00958     }
00959     if ( *scursor == ':' ) {
00960       scursor++;
00961       return true;
00962     }
00963     if ( *scursor == ',' ) {
00964       scursor++;
00965     }
00966   }
00967 
00968   return false;
00969 }
00970 
00971 bool parseAddrSpec( const char* &scursor, const char * const send,
00972                     AddrSpec &result, bool isCRLF )
00973 {
00974   //
00975   // STEP 1:
00976   // local-part := dot-atom / quoted-string / word *("." word)
00977   //
00978   // this is equivalent to:
00979   // local-part := word *("." word)
00980 
00981   QString maybeLocalPart;
00982   QString tmp;
00983 
00984   while ( scursor != send ) {
00985     // first, eat any whitespace
00986     eatCFWS( scursor, send, isCRLF );
00987 
00988     char ch = *scursor++;
00989     switch ( ch ) {
00990     case '.': // dot
00991       maybeLocalPart += QChar('.');
00992       break;
00993 
00994     case '@':
00995       goto SAW_AT_SIGN;
00996       break;
00997 
00998     case '"': // quoted-string
00999       tmp.clear();
01000       if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
01001         maybeLocalPart += tmp;
01002       } else {
01003         return false;
01004       }
01005       break;
01006 
01007     default: // atom
01008       scursor--; // re-set scursor to point to ch again
01009       tmp.clear();
01010       if ( parseAtom( scursor, send, tmp, false /* no 8bit */ ) ) {
01011         maybeLocalPart += tmp;
01012       } else {
01013         return false; // parseAtom can only fail if the first char is non-atext.
01014       }
01015       break;
01016     }
01017   }
01018 
01019   return false;
01020 
01021   //
01022   // STEP 2:
01023   // domain
01024   //
01025 
01026 SAW_AT_SIGN:
01027 
01028   assert( *(scursor-1) == '@' );
01029 
01030   QString maybeDomain;
01031   if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
01032     return false;
01033   }
01034 
01035   result.localPart = maybeLocalPart;
01036   result.domain = maybeDomain;
01037 
01038   return true;
01039 }
01040 
01041 bool parseAngleAddr( const char* &scursor, const char * const send,
01042                      AddrSpec &result, bool isCRLF )
01043 {
01044   // first, we need an opening angle bracket:
01045   eatCFWS( scursor, send, isCRLF );
01046   if ( scursor == send || *scursor != '<' ) {
01047     return false;
01048   }
01049   scursor++; // eat '<'
01050 
01051   eatCFWS( scursor, send, isCRLF );
01052   if ( scursor == send ) {
01053     return false;
01054   }
01055 
01056   if ( *scursor == '@' || *scursor == ',' ) {
01057     // obs-route: parse, but ignore:
01058     KMIME_WARN << "obsolete source route found! ignoring.";
01059     QStringList dummy;
01060     if ( !parseObsRoute( scursor, send, dummy,
01061                          isCRLF, false /* don't save */ ) ) {
01062       return false;
01063     }
01064     // angle-addr isn't complete until after the '>':
01065     if ( scursor == send ) {
01066       return false;
01067     }
01068   }
01069 
01070   // parse addr-spec:
01071   AddrSpec maybeAddrSpec;
01072   if ( !parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
01073     return false;
01074   }
01075 
01076   eatCFWS( scursor, send, isCRLF );
01077   if ( scursor == send || *scursor != '>' ) {
01078     return false;
01079   }
01080   scursor++;
01081 
01082   result = maybeAddrSpec;
01083   return true;
01084 
01085 }
01086 
01087 bool parseMailbox( const char* &scursor, const char * const send,
01088                    Mailbox &result, bool isCRLF )
01089 {
01090   eatCFWS( scursor, send, isCRLF );
01091   if ( scursor == send ) {
01092     return false;
01093   }
01094 
01095   AddrSpec maybeAddrSpec;
01096   QString maybeDisplayName;
01097 
01098   // first, try if it's a vanilla addr-spec:
01099   const char * oldscursor = scursor;
01100   if ( parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
01101     result.setAddress( maybeAddrSpec );
01102     // check for the obsolete form of display-name (as comment):
01103     eatWhiteSpace( scursor, send );
01104     if ( scursor != send && *scursor == '(' ) {
01105       scursor++;
01106       if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /*keep*/ ) ) {
01107         return false;
01108       }
01109     }
01110     result.setNameFrom7Bit( maybeDisplayName.toLatin1() );
01111     return true;
01112   }
01113   scursor = oldscursor;
01114 
01115   // second, see if there's a display-name:
01116   if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
01117     // failed: reset cursor, note absent display-name
01118     maybeDisplayName.clear();
01119     scursor = oldscursor;
01120   } else {
01121     // succeeded: eat CFWS
01122     eatCFWS( scursor, send, isCRLF );
01123     if ( scursor == send ) {
01124       return false;
01125     }
01126   }
01127 
01128   // third, parse the angle-addr:
01129   if ( !parseAngleAddr( scursor, send, maybeAddrSpec, isCRLF ) ) {
01130     return false;
01131   }
01132 
01133   if ( maybeDisplayName.isNull() ) {
01134     // check for the obsolete form of display-name (as comment):
01135     eatWhiteSpace( scursor, send );
01136     if ( scursor != send && *scursor == '(' ) {
01137       scursor++;
01138       if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /*keep*/ ) ) {
01139         return false;
01140       }
01141     }
01142   }
01143 
01144   result.setName( maybeDisplayName );
01145   result.setAddress( maybeAddrSpec );
01146   return true;
01147 }
01148 
01149 bool parseGroup( const char* &scursor, const char * const send,
01150                  Address &result, bool isCRLF )
01151 {
01152   // group         := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
01153   //
01154   // equivalent to:
01155   // group   := display-name ":" [ obs-mbox-list ] ";"
01156 
01157   eatCFWS( scursor, send, isCRLF );
01158   if ( scursor == send ) {
01159     return false;
01160   }
01161 
01162   // get display-name:
01163   QString maybeDisplayName;
01164   if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
01165     return false;
01166   }
01167 
01168   // get ":":
01169   eatCFWS( scursor, send, isCRLF );
01170   if ( scursor == send || *scursor != ':' ) {
01171     return false;
01172   }
01173 
01174   result.displayName = maybeDisplayName;
01175 
01176   // get obs-mbox-list (may contain empty entries):
01177   scursor++;
01178   while ( scursor != send ) {
01179     eatCFWS( scursor, send, isCRLF );
01180     if ( scursor == send ) {
01181       return false;
01182     }
01183 
01184     // empty entry:
01185     if ( *scursor == ',' ) {
01186       scursor++;
01187       continue;
01188     }
01189 
01190     // empty entry ending the list:
01191     if ( *scursor == ';' ) {
01192       scursor++;
01193       return true;
01194     }
01195 
01196     Mailbox maybeMailbox;
01197     if ( !parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
01198       return false;
01199     }
01200     result.mailboxList.append( maybeMailbox );
01201 
01202     eatCFWS( scursor, send, isCRLF );
01203     // premature end:
01204     if ( scursor == send ) {
01205       return false;
01206     }
01207     // regular end of the list:
01208     if ( *scursor == ';' ) {
01209       scursor++;
01210       return true;
01211     }
01212     // eat regular list entry separator:
01213     if ( *scursor == ',' ) {
01214       scursor++;
01215     }
01216   }
01217   return false;
01218 }
01219 
01220 bool parseAddress( const char* &scursor, const char * const send,
01221                    Address &result, bool isCRLF )
01222 {
01223   // address       := mailbox / group
01224 
01225   eatCFWS( scursor, send, isCRLF );
01226   if ( scursor == send ) {
01227     return false;
01228   }
01229 
01230   // first try if it's a single mailbox:
01231   Mailbox maybeMailbox;
01232   const char * oldscursor = scursor;
01233   if ( parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
01234     // yes, it is:
01235     result.displayName.clear();
01236     result.mailboxList.append( maybeMailbox );
01237     return true;
01238   }
01239   scursor = oldscursor;
01240 
01241   Address maybeAddress;
01242 
01243   // no, it's not a single mailbox. Try if it's a group:
01244   if ( !parseGroup( scursor, send, maybeAddress, isCRLF ) ) {
01245     return false;
01246   }
01247 
01248   result = maybeAddress;
01249   return true;
01250 }
01251 
01252 bool parseAddressList( const char* &scursor, const char * const send,
01253                        AddressList &result, bool isCRLF )
01254 {
01255   while ( scursor != send ) {
01256     eatCFWS( scursor, send, isCRLF );
01257     // end of header: this is OK.
01258     if ( scursor == send ) {
01259       return true;
01260     }
01261     // empty entry: ignore:
01262     if ( *scursor == ',' ) {
01263       scursor++;
01264       continue;
01265     }
01266     // broken clients might use ';' as list delimiter, accept that as well
01267     if ( *scursor == ';' ) {
01268       scursor++;
01269       continue;
01270     }
01271 
01272     // parse one entry
01273     Address maybeAddress;
01274     if ( !parseAddress( scursor, send, maybeAddress, isCRLF ) ) {
01275       return false;
01276     }
01277     result.append( maybeAddress );
01278 
01279     eatCFWS( scursor, send, isCRLF );
01280     // end of header: this is OK.
01281     if ( scursor == send ) {
01282       return true;
01283     }
01284     // comma separating entries: eat it.
01285     if ( *scursor == ',' ) {
01286       scursor++;
01287     }
01288   }
01289   return true;
01290 }
01291 
01292 static QString asterisk = QString::fromLatin1( "*0*", 1 );
01293 static QString asteriskZero = QString::fromLatin1( "*0*", 2 );
01294 //static QString asteriskZeroAsterisk = QString::fromLatin1( "*0*", 3 );
01295 
01296 bool parseParameter( const char* &scursor, const char * const send,
01297                      QPair<QString,QStringOrQPair> &result, bool isCRLF )
01298 {
01299   // parameter = regular-parameter / extended-parameter
01300   // regular-parameter = regular-parameter-name "=" value
01301   // extended-parameter =
01302   // value = token / quoted-string
01303   //
01304   // note that rfc2231 handling is out of the scope of this function.
01305   // Therefore we return the attribute as QString and the value as
01306   // (start,length) tupel if we see that the value is encoded
01307   // (trailing asterisk), for parseParameterList to decode...
01308 
01309   eatCFWS( scursor, send, isCRLF );
01310   if ( scursor == send ) {
01311     return false;
01312   }
01313 
01314   //
01315   // parse the parameter name:
01316   //
01317   QString maybeAttribute;
01318   if ( !parseToken( scursor, send, maybeAttribute, false /* no 8bit */ ) ) {
01319     return false;
01320   }
01321 
01322   eatCFWS( scursor, send, isCRLF );
01323   // premature end: not OK (haven't seen '=' yet).
01324   if ( scursor == send || *scursor != '=' ) {
01325     return false;
01326   }
01327   scursor++; // eat '='
01328 
01329   eatCFWS( scursor, send, isCRLF );
01330   if ( scursor == send ) {
01331     // don't choke on attribute=, meaning the value was omitted:
01332     if ( maybeAttribute.endsWith( asterisk ) ) {
01333       KMIME_WARN << "attribute ends with \"*\", but value is empty!"
01334         "Chopping away \"*\".";
01335       maybeAttribute.truncate( maybeAttribute.length() - 1 );
01336     }
01337     result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01338     return true;
01339   }
01340 
01341   const char * oldscursor = scursor;
01342 
01343   //
01344   // parse the parameter value:
01345   //
01346   QStringOrQPair maybeValue;
01347   if ( *scursor == '"' ) {
01348     // value is a quoted-string:
01349     scursor++;
01350     if ( maybeAttribute.endsWith( asterisk ) ) {
01351       // attributes ending with "*" designate extended-parameters,
01352       // which cannot have quoted-strings as values. So we remove the
01353       // trailing "*" to not confuse upper layers.
01354       KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
01355         "Chopping away \"*\".";
01356       maybeAttribute.truncate( maybeAttribute.length() - 1 );
01357     }
01358 
01359     if ( !parseGenericQuotedString( scursor, send, maybeValue.qstring, isCRLF ) ) {
01360       scursor = oldscursor;
01361       result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01362       return false; // this case needs further processing by upper layers!!
01363     }
01364   } else {
01365     // value is a token:
01366     if ( !parseToken( scursor, send, maybeValue.qpair, false /* no 8bit */ ) ) {
01367       scursor = oldscursor;
01368       result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01369       return false; // this case needs further processing by upper layers!!
01370     }
01371   }
01372 
01373   result = qMakePair( maybeAttribute.toLower(), maybeValue );
01374   return true;
01375 }
01376 
01377 bool parseRawParameterList( const char* &scursor, const char * const send,
01378                             QMap<QString,QStringOrQPair> &result,
01379                             bool isCRLF )
01380 {
01381   // we use parseParameter() consecutively to obtain a map of raw
01382   // attributes to raw values. "Raw" here means that we don't do
01383   // rfc2231 decoding and concatenation. This is left to
01384   // parseParameterList(), which will call this function.
01385   //
01386   // The main reason for making this chunk of code a separate
01387   // (private) method is that we can deal with broken parameters
01388   // _here_ and leave the rfc2231 handling solely to
01389   // parseParameterList(), which will still be enough work.
01390 
01391   while ( scursor != send ) {
01392     eatCFWS( scursor, send, isCRLF );
01393     // empty entry ending the list: OK.
01394     if ( scursor == send ) {
01395       return true;
01396     }
01397     // empty list entry: ignore.
01398     if ( *scursor == ';' ) {
01399       scursor++;
01400       continue;
01401     }
01402 
01403     QPair<QString,QStringOrQPair> maybeParameter;
01404     if ( !parseParameter( scursor, send, maybeParameter, isCRLF ) ) {
01405       // we need to do a bit of work if the attribute is not
01406       // NULL. These are the cases marked with "needs further
01407       // processing" in parseParameter(). Specifically, parsing of the
01408       // token or the quoted-string, which should represent the value,
01409       // failed. We take the easy way out and simply search for the
01410       // next ';' to start parsing again. (Another option would be to
01411       // take the text between '=' and ';' as value)
01412       if ( maybeParameter.first.isNull() ) {
01413         return false;
01414       }
01415       while ( scursor != send ) {
01416         if ( *scursor++ == ';' ) {
01417           goto IS_SEMICOLON;
01418         }
01419       }
01420       // scursor == send case: end of list.
01421       return true;
01422     IS_SEMICOLON:
01423       // *scursor == ';' case: parse next entry.
01424       continue;
01425     }
01426     // successful parsing brings us here:
01427     result.insert( maybeParameter.first, maybeParameter.second );
01428 
01429     eatCFWS( scursor, send, isCRLF );
01430     // end of header: ends list.
01431     if ( scursor == send ) {
01432       return true;
01433     }
01434     // regular separator: eat it.
01435     if ( *scursor == ';' ) {
01436       scursor++;
01437     }
01438   }
01439   return true;
01440 }
01441 
01442 static void decodeRFC2231Value( Codec* &rfc2231Codec,
01443                                 QTextCodec* &textcodec,
01444                                 bool isContinuation, QString &value,
01445                                 QPair<const char*,int> &source )
01446 {
01447   //
01448   // parse the raw value into (charset,language,text):
01449   //
01450 
01451   const char * decBegin = source.first;
01452   const char * decCursor = decBegin;
01453   const char * decEnd = decCursor + source.second;
01454 
01455   if ( !isContinuation ) {
01456     // find the first single quote
01457     while ( decCursor != decEnd ) {
01458       if ( *decCursor == '\'' ) {
01459         break;
01460       } else {
01461         decCursor++;
01462       }
01463     }
01464 
01465     if ( decCursor == decEnd ) {
01466       // there wasn't a single single quote at all!
01467       // take the whole value to be in latin-1:
01468       KMIME_WARN << "No charset in extended-initial-value."
01469         "Assuming \"iso-8859-1\".";
01470       value += QString::fromLatin1( decBegin, source.second );
01471       return;
01472     }
01473 
01474     QByteArray charset( decBegin, decCursor - decBegin );
01475 
01476     const char * oldDecCursor = ++decCursor;
01477     // find the second single quote (we ignore the language tag):
01478     while ( decCursor != decEnd ) {
01479       if ( *decCursor == '\'' ) {
01480         break;
01481       } else {
01482         decCursor++;
01483       }
01484     }
01485     if ( decCursor == decEnd ) {
01486       KMIME_WARN << "No language in extended-initial-value."
01487         "Trying to recover.";
01488       decCursor = oldDecCursor;
01489     } else {
01490       decCursor++;
01491     }
01492 
01493     // decCursor now points to the start of the
01494     // "extended-other-values":
01495 
01496     //
01497     // get the decoders:
01498     //
01499 
01500     bool matchOK = false;
01501     textcodec = KGlobal::charsets()->codecForName( charset, matchOK );
01502     if ( !matchOK ) {
01503       textcodec = 0;
01504       KMIME_WARN_UNKNOWN( Charset, charset );
01505     }
01506   }
01507 
01508   if ( !rfc2231Codec ) {
01509     rfc2231Codec = Codec::codecForName("x-kmime-rfc2231");
01510     assert( rfc2231Codec );
01511   }
01512 
01513   if ( !textcodec ) {
01514     value += QString::fromLatin1( decCursor, decEnd - decCursor );
01515     return;
01516   }
01517 
01518   Decoder * dec = rfc2231Codec->makeDecoder();
01519   assert( dec );
01520 
01521   //
01522   // do the decoding:
01523   //
01524 
01525   QByteArray buffer;
01526   buffer.resize( rfc2231Codec->maxDecodedSizeFor( decEnd - decCursor ) );
01527   QByteArray::Iterator bit = buffer.begin();
01528   QByteArray::ConstIterator bend = buffer.end();
01529 
01530   if ( !dec->decode( decCursor, decEnd, bit, bend ) ) {
01531     KMIME_WARN << rfc2231Codec->name()
01532                << "codec lies about its maxDecodedSizeFor()" << endl
01533                << "result may be truncated";
01534   }
01535 
01536   value += textcodec->toUnicode( buffer.begin(), bit - buffer.begin() );
01537 
01538   kDebug(5320) << "value now: \"" << value << "\"";
01539   // cleanup:
01540   delete dec;
01541 }
01542 
01543 // known issues:
01544 //  - permutes rfc2231 continuations when the total number of parts
01545 //    exceeds 10 (other-sections then becomes *xy, ie. two digits)
01546 
01547 bool parseParameterList( const char* &scursor, const char * const send,
01548                          QMap<QString,QString> &result, bool isCRLF )
01549 {
01550   // parse the list into raw attribute-value pairs:
01551   QMap<QString,QStringOrQPair> rawParameterList;
01552   if (!parseRawParameterList( scursor, send, rawParameterList, isCRLF ) ) {
01553     return false;
01554   }
01555 
01556   if ( rawParameterList.isEmpty() ) {
01557     return true;
01558   }
01559 
01560   // decode rfc 2231 continuations and alternate charset encoding:
01561 
01562   // NOTE: this code assumes that what QMapIterator delivers is sorted
01563   // by the key!
01564 
01565   Codec * rfc2231Codec = 0;
01566   QTextCodec * textcodec = 0;
01567   QString attribute;
01568   QString value;
01569   enum Modes {
01570     NoMode = 0x0, Continued = 0x1, Encoded = 0x2
01571   } mode;
01572 
01573   QMap<QString,QStringOrQPair>::Iterator it, end = rawParameterList.end();
01574 
01575   for ( it = rawParameterList.begin() ; it != end ; ++it ) {
01576     if ( attribute.isNull() || !it.key().startsWith( attribute ) ) {
01577       //
01578       // new attribute:
01579       //
01580 
01581       // store the last attribute/value pair in the result map now:
01582       if ( !attribute.isNull() ) {
01583         result.insert( attribute, value );
01584       }
01585       // and extract the information from the new raw attribute:
01586       value.clear();
01587       attribute = it.key();
01588       mode = NoMode;
01589       // is the value encoded?
01590       if ( attribute.endsWith( asterisk ) ) {
01591         attribute.truncate( attribute.length() - 1 );
01592         mode = (Modes) ((int) mode | Encoded);
01593       }
01594       // is the value continued?
01595       if ( attribute.endsWith( asteriskZero ) ) {
01596         attribute.truncate( attribute.length() - 2 );
01597         mode = (Modes) ((int) mode | Continued);
01598       }
01599       //
01600       // decode if necessary:
01601       //
01602       if ( mode & Encoded ) {
01603         decodeRFC2231Value( rfc2231Codec, textcodec,
01604                             false, /* isn't continuation */
01605                             value, (*it).qpair );
01606       } else {
01607         // not encoded.
01608         if ( (*it).qpair.first ) {
01609           value += QString::fromLatin1( (*it).qpair.first, (*it).qpair.second );
01610         } else {
01611           value += (*it).qstring;
01612         }
01613       }
01614 
01615       //
01616       // shortcut-processing when the value isn't encoded:
01617       //
01618 
01619       if ( !(mode & Continued) ) {
01620         // save result already:
01621         result.insert( attribute, value );
01622         // force begin of a new attribute:
01623         attribute.clear();
01624       }
01625     } else { // it.key().startsWith( attribute )
01626       //
01627       // continuation
01628       //
01629 
01630       // ignore the section and trust QMap to have sorted the keys:
01631       if ( it.key().endsWith( asterisk ) ) {
01632         // encoded
01633         decodeRFC2231Value( rfc2231Codec, textcodec,
01634                             true, /* is continuation */
01635                             value, (*it).qpair );
01636       } else {
01637         // not encoded
01638         if ( (*it).qpair.first ) {
01639           value += QString::fromLatin1( (*it).qpair.first, (*it).qpair.second );
01640         } else {
01641           value += (*it).qstring;
01642         }
01643       }
01644     }
01645   }
01646 
01647   // write last attr/value pair:
01648   if ( !attribute.isNull() ) {
01649     result.insert( attribute, value );
01650   }
01651 
01652   return true;
01653 }
01654 
01655 static const char * stdDayNames[] = {
01656   "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
01657 };
01658 static const int stdDayNamesLen = sizeof stdDayNames / sizeof *stdDayNames;
01659 
01660 static bool parseDayName( const char* &scursor, const char * const send )
01661 {
01662   // check bounds:
01663   if ( send - scursor < 3 ) {
01664     return false;
01665   }
01666 
01667   for ( int i = 0 ; i < stdDayNamesLen ; ++i ) {
01668     if ( qstrnicmp( scursor, stdDayNames[i], 3 ) == 0 ) {
01669       scursor += 3;
01670       // kDebug(5320) << "found" << stdDayNames[i];
01671       return true;
01672     }
01673   }
01674 
01675   return false;
01676 }
01677 
01678 static const char * stdMonthNames[] = {
01679   "Jan", "Feb", "Mar", "Apr", "May", "Jun",
01680   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
01681 };
01682 static const int stdMonthNamesLen =
01683                               sizeof stdMonthNames / sizeof *stdMonthNames;
01684 
01685 static bool parseMonthName( const char* &scursor, const char * const send,
01686                             int &result )
01687 {
01688   // check bounds:
01689   if ( send - scursor < 3 ) {
01690     return false;
01691   }
01692 
01693   for ( result = 0 ; result < stdMonthNamesLen ; ++result ) {
01694     if ( qstrnicmp( scursor, stdMonthNames[result], 3 ) == 0 ) {
01695       scursor += 3;
01696       return true;
01697     }
01698   }
01699 
01700   // not found:
01701   return false;
01702 }
01703 
// Table of time zone names and their offsets in seconds east of GMT.
// Lookups take the first matching entry, so every name must be listed
// only once.
static const struct {
  const char * tzName;
  long int secsEastOfGMT;
} timeZones[] = {
  // rfc 822 timezones:
  { "GMT", 0 },
  { "UT", 0 },
  { "EDT", -4*3600 },
  { "EST", -5*3600 },
  { "CDT", -5*3600 }, // was a second "MST" entry, which shadowed MST's real offset
  { "CST", -6*3600 },
  { "MDT", -6*3600 },
  { "MST", -7*3600 },
  { "PDT", -7*3600 },
  { "PST", -8*3600 },
  // common, non-rfc-822 zones:
  { "CET", 1*3600 },
  { "MET", 1*3600 },
  { "UTC", 0 },
  { "CEST", 2*3600 },
  { "BST", 1*3600 },
  // rfc 822 military timezones:
  { "Z", 0 },
  { "A", -1*3600 },
  { "B", -2*3600 },
  { "C", -3*3600 },
  { "D", -4*3600 },
  { "E", -5*3600 },
  { "F", -6*3600 },
  { "G", -7*3600 },
  { "H", -8*3600 },
  { "I", -9*3600 },
  // J is not used!
  { "K", -10*3600 },
  { "L", -11*3600 },
  { "M", -12*3600 },
  { "N", 1*3600 },
  { "O", 2*3600 },
  { "P", 3*3600 },
  { "Q", 4*3600 },
  { "R", 5*3600 },
  { "S", 6*3600 },
  { "T", 7*3600 },
  { "U", 8*3600 },
  { "V", 9*3600 },
  { "W", 10*3600 },
  { "X", 11*3600 },
  { "Y", 12*3600 },
};
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
01754 
01755 static bool parseAlphaNumericTimeZone( const char* &scursor,
01756                                        const char * const send,
01757                                        long int &secsEastOfGMT,
01758                                        bool &timeZoneKnown )
01759 {
01760   QPair<const char*,int> maybeTimeZone( 0, 0 );
01761   if ( !parseToken( scursor, send, maybeTimeZone, false /*no 8bit*/ ) ) {
01762     return false;
01763   }
01764   for ( int i = 0 ; i < timeZonesLen ; ++i ) {
01765     if ( qstrnicmp( timeZones[i].tzName,
01766                     maybeTimeZone.first, maybeTimeZone.second ) == 0 ) {
01767       scursor += maybeTimeZone.second;
01768       secsEastOfGMT = timeZones[i].secsEastOfGMT;
01769       timeZoneKnown = true;
01770       return true;
01771     }
01772   }
01773 
01774   // don't choke just because we don't happen to know the time zone
01775   KMIME_WARN_UNKNOWN( time zone,
01776                       QByteArray( maybeTimeZone.first, maybeTimeZone.second ) );
01777   secsEastOfGMT = 0;
01778   timeZoneKnown = false;
01779   return true;
01780 }
01781 
01782 // parse a number and return the number of digits parsed:
// Parses a run of decimal digits starting at scursor.
//
// scursor  in/out cursor; left on the first non-digit (or on send)
// send     end of the input
// result   receives the value of the digits, 0 if there are none
// Returns the number of digits consumed.
// NOTE: there is no protection against overflow of result for very
// long digit runs.
int parseDigits( const char* &scursor, const char * const send, int &result )
{
  result = 0;
  int digits = 0;
  // isdigit() is only defined for values representable as unsigned
  // char (and EOF), so 8bit input must not be passed as a plain,
  // possibly negative, char.
  for ( ; scursor != send && isdigit( static_cast<unsigned char>( *scursor ) ) ;
        scursor++, digits++ ) {
    result *= 10;
    result += int( *scursor - '0' );
  }
  return digits;
}
01793 
01794 static bool parseTimeOfDay( const char* &scursor, const char * const send,
01795                             int &hour, int &min, int &sec, bool isCRLF=false )
01796 {
01797   // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
01798 
01799   //
01800   // 2DIGIT representing "hour":
01801   //
01802   if ( !parseDigits( scursor, send, hour ) ) {
01803     return false;
01804   }
01805 
01806   eatCFWS( scursor, send, isCRLF );
01807   if ( scursor == send || *scursor != ':' ) {
01808     return false;
01809   }
01810   scursor++; // eat ':'
01811 
01812   eatCFWS( scursor, send, isCRLF );
01813   if ( scursor == send ) {
01814     return false;
01815   }
01816 
01817   //
01818   // 2DIGIT representing "minute":
01819   //
01820   if ( !parseDigits( scursor, send, min ) ) {
01821     return false;
01822   }
01823 
01824   eatCFWS( scursor, send, isCRLF );
01825   if ( scursor == send ) {
01826     return true; // seconds are optional
01827   }
01828 
01829   //
01830   // let's see if we have a 2DIGIT representing "second":
01831   //
01832   if ( *scursor == ':' ) {
01833     // yepp, there are seconds:
01834     scursor++; // eat ':'
01835     eatCFWS( scursor, send, isCRLF );
01836     if ( scursor == send ) {
01837       return false;
01838     }
01839 
01840     if ( !parseDigits( scursor, send, sec ) ) {
01841       return false;
01842     }
01843   } else {
01844     sec = 0;
01845   }
01846 
01847   return true;
01848 }
01849 
01850 bool parseTime( const char* &scursor, const char * send,
01851                 int &hour, int &min, int &sec, long int &secsEastOfGMT,
01852                 bool &timeZoneKnown, bool isCRLF )
01853 {
01854   // time := time-of-day CFWS ( zone / obs-zone )
01855   //
01856   // obs-zone    := "UT" / "GMT" /
01857   //                "EST" / "EDT" / ; -0500 / -0400
01858   //                "CST" / "CDT" / ; -0600 / -0500
01859   //                "MST" / "MDT" / ; -0700 / -0600
01860   //                "PST" / "PDT" / ; -0800 / -0700
01861   //                "A"-"I" / "a"-"i" /
01862   //                "K"-"Z" / "k"-"z"
01863 
01864   eatCFWS( scursor, send, isCRLF );
01865   if ( scursor == send ) {
01866     return false;
01867   }
01868 
01869   if ( !parseTimeOfDay( scursor, send, hour, min, sec, isCRLF ) ) {
01870     return false;
01871   }
01872 
01873   eatCFWS( scursor, send, isCRLF );
01874   if ( scursor == send ) {
01875     timeZoneKnown = false;
01876     secsEastOfGMT = 0;
01877     return true; // allow missing timezone
01878   }
01879 
01880   timeZoneKnown = true;
01881   if ( *scursor == '+' || *scursor == '-' ) {
01882     // remember and eat '-'/'+':
01883     const char sign = *scursor++;
01884     // numerical timezone:
01885     int maybeTimeZone;
01886     if ( parseDigits( scursor, send, maybeTimeZone ) != 4 ) {
01887       return false;
01888     }
01889     secsEastOfGMT = 60 * ( maybeTimeZone / 100 * 60 + maybeTimeZone % 100 );
01890     if ( sign == '-' ) {
01891       secsEastOfGMT *= -1;
01892       if ( secsEastOfGMT == 0 ) {
01893         timeZoneKnown = false; // -0000 means indetermined tz
01894       }
01895     }
01896   } else {
01897     // maybe alphanumeric timezone:
01898     if ( !parseAlphaNumericTimeZone( scursor, send, secsEastOfGMT, timeZoneKnown ) ) {
01899       return false;
01900     }
01901   }
01902   return true;
01903 }
01904 
01905 bool parseDateTime( const char* &scursor, const char * const send,
01906                     KDateTime &result, bool isCRLF )
01907 {
01908   // Parsing date-time; strict mode:
01909   //
01910   // date-time   := [ [CFWS] day-name [CFWS] "," ]                      ; wday
01911   // (expanded)     [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date
01912   //                time
01913   //
01914   // day-name    := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
01915   // month-name  := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
01916   //                "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
01917 
01918   result = KDateTime();
01919   QDateTime maybeDateTime;
01920 
01921   eatCFWS( scursor, send, isCRLF );
01922   if ( scursor == send ) {
01923     return false;
01924   }
01925 
01926   //
01927   // let's see if there's a day-of-week:
01928   //
01929   if ( parseDayName( scursor, send ) ) {
01930     eatCFWS( scursor, send, isCRLF );
01931     if ( scursor == send ) {
01932       return false;
01933     }
01934     // day-name should be followed by ',' but we treat it as optional:
01935     if ( *scursor == ',' ) {
01936       scursor++; // eat ','
01937       eatCFWS( scursor, send, isCRLF );
01938     }
01939   }
01940 
01941   //
01942   // 1*2DIGIT representing "day" (of month):
01943   //
01944   int maybeDay;
01945   if ( !parseDigits( scursor, send, maybeDay ) ) {
01946     return false;
01947   }
01948 
01949   eatCFWS( scursor, send, isCRLF );
01950   if ( scursor == send ) {
01951     return false;
01952   }
01953 
01954   //
01955   // month-name:
01956   //
01957   int maybeMonth = 0;
01958   if ( !parseMonthName( scursor, send, maybeMonth ) ) {
01959     return false;
01960   }
01961   if ( scursor == send ) {
01962     return false;
01963   }
01964   assert( maybeMonth >= 0 ); assert( maybeMonth <= 11 );
01965   ++maybeMonth; // 0-11 -> 1-12
01966 
01967   eatCFWS( scursor, send, isCRLF );
01968   if ( scursor == send ) {
01969     return false;
01970   }
01971 
01972   //
01973   // 2*DIGIT representing "year":
01974   //
01975   int maybeYear;
01976   if ( !parseDigits( scursor, send, maybeYear ) ) {
01977     return false;
01978   }
01979   // RFC 2822 4.3 processing:
01980   if ( maybeYear < 50 ) {
01981     maybeYear += 2000;
01982   } else if ( maybeYear < 1000 ) {
01983     maybeYear += 1900;
01984   }
01985   // else keep as is
01986   if ( maybeYear < 1900 ) {
01987     return false; // rfc2822, 3.3
01988   }
01989 
01990   eatCFWS( scursor, send, isCRLF );
01991   if ( scursor == send ) {
01992     return false;
01993   }
01994 
01995   maybeDateTime.setDate( QDate( maybeYear, maybeMonth, maybeDay ) );
01996 
01997   //
01998   // time
01999   //
02000   int maybeHour, maybeMinute, maybeSecond;
02001   long int secsEastOfGMT;
02002   bool timeZoneKnown = true;
02003 
02004   if ( !parseTime( scursor, send,
02005                    maybeHour, maybeMinute, maybeSecond,
02006                    secsEastOfGMT, timeZoneKnown, isCRLF ) ) {
02007     return false;
02008   }
02009 
02010   maybeDateTime.setTime( QTime( maybeHour, maybeMinute, maybeSecond ) );
02011   if ( !maybeDateTime.isValid() )
02012     return false;
02013 
02014   result = KDateTime( maybeDateTime, KDateTime::Spec( KDateTime::OffsetFromUTC, secsEastOfGMT ) );
02015   if ( !result.isValid() )
02016     return false;
02017   return true;
02018 }
02019 
02020 } // namespace HeaderParsing
02021 
02022 } // namespace KMime

KMIME Library

Skip menu "KMIME Library"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

KDE-PIM Libraries

Skip menu "KDE-PIM Libraries"
  • akonadi
  • kabc
  • kblog
  • kcal
  • kimap
  • kioslave
  •   imap4
  •   mbox
  • kldap
  • kmime
  • kpimidentities
  • kpimutils
  • kresources
  • ktnef
  • kxmlrpcclient
  • mailtransport
  • qgpgme
  • syndication
  •   atom
  •   rdf
  •   rss2
Generated for KDE-PIM Libraries by doxygen 1.5.6
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal