ispell_checker.cpp

00001 /* vim: set sw=8: -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
00002 /* kspell2 - adopted from Enchant
00003  * Copyright (C) 2003 Dom Lachowicz
00004  * Copyright (C) 2004 Zack Rusin <zack@kde.org>
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the
00018  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019  * Boston, MA 02110-1301, USA.
00020  *
00021  * In addition, as a special exception, Dom Lachowicz
00022  * gives permission to link the code of this program with
00023  * non-LGPL Spelling Provider libraries (eg: a MSFT Office
00024  * spell checker backend) and distribute linked combinations including
00025  * the two.  You must obey the GNU Lesser General Public License in all
00026  * respects for all of the code used other than said providers.  If you modify
00027  * this file, you may extend this exception to your version of the
00028  * file, but you are not obligated to do so.  If you do not wish to
00029  * do so, delete this exception statement from your version.
00030  */
00031 
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include <string.h>
00035 
00036 #include <string>
00037 #include <vector>
00038 
00039 #include "sp_spell.h"
00040 #include "ispell_checker.h"
00041 
00042 #include <qmap.h>
00043 #include <qdir.h>
00044 #include <qfileinfo.h>
00045 
00046 /***************************************************************************/
00047 
/* One row of the language lookup table: a locale tag together with the
 * ispell hash file used for it and the name of that file's encoding. */
struct str_ispell_map
{
    const char * lang;  /* locale tag, e.g. "en_US" */
    const char * dict;  /* hash file name, e.g. "american.hash" */
    const char * enc;   /* encoding name, e.g. "iso-8859-1" */
};
typedef struct str_ispell_map IspellMap;
00054 
/* Directories probed for ispell hash files, in the order listed.
 * The array ends with a 0 sentinel: the loops in s_buildHashNames() and
 * s_allDics() walk it until they read that null entry. */
00055 static const char *ispell_dirs [] = {
00056     "/usr/lib/ispell",
00057     "/usr/local/lib/ispell",
00058     "/usr/local/share/ispell",
00059     "/usr/share/ispell",
00060     "/usr/pkg/lib",
00061     0
00062 };
/* Language lookup table: { locale tag, ispell hash file name, encoding name }.
 * loadDictionaryForLanguage() compares the requested tag against the first
 * column with strcmp() and uses the first matching row; s_allDics() compares
 * the hash files found on disk against the second column. */
00063 static const IspellMap ispell_map [] = {
00064     {"ca"    ,"catala.hash"         ,"iso-8859-1" },
00065     {"ca_ES" ,"catala.hash"         ,"iso-8859-1" },
00066     {"cs"    ,"czech.hash"          ,"iso-8859-2" },
00067     {"cs_CZ" ,"czech.hash"          ,"iso-8859-2" },
00068     {"da"    ,"dansk.hash"          ,"iso-8859-1" },
00069     {"da_DK" ,"dansk.hash"          ,"iso-8859-1" },
00070     {"de"    ,"deutsch.hash"        ,"iso-8859-1" },
00071     {"de_CH" ,"swiss.hash"          ,"iso-8859-1" },
00072     {"de_AT" ,"deutsch.hash"        ,"iso-8859-1" },
00073     {"de_DE" ,"deutsch.hash"        ,"iso-8859-1" },
00074     {"el"    ,"ellhnika.hash"       ,"iso-8859-7" },
00075     {"el_GR" ,"ellhnika.hash"       ,"iso-8859-7" },
00076     {"en"    ,"british.hash"        ,"iso-8859-1" },
00077     {"en_AU" ,"british.hash"        ,"iso-8859-1" },
00078     {"en_BZ" ,"british.hash"        ,"iso-8859-1" },
00079     {"en_CA" ,"british.hash"        ,"iso-8859-1" },
00080     {"en_GB" ,"british.hash"        ,"iso-8859-1" },
00081     {"en_IE" ,"british.hash"        ,"iso-8859-1" },
00082     {"en_JM" ,"british.hash"        ,"iso-8859-1" },
00083     {"en_NZ" ,"british.hash"        ,"iso-8859-1" },
00084     {"en_TT" ,"british.hash"        ,"iso-8859-1" },
00085     {"en_ZA" ,"british.hash"        ,"iso-8859-1" },
00086     {"en_ZW" ,"british.hash"        ,"iso-8859-1" },
00087     {"en_PH" ,"american.hash"       ,"iso-8859-1" },
00088     {"en_US" ,"american.hash"       ,"iso-8859-1" },
00089     {"eo"    ,"esperanto.hash"      ,"iso-8859-3" },
00090     {"es"    ,"espanol.hash"        ,"iso-8859-1" },
00091     {"es_AR" ,"espanol.hash"        ,"iso-8859-1" },
00092     {"es_BO" ,"espanol.hash"        ,"iso-8859-1" },
00093     {"es_CL" ,"espanol.hash"        ,"iso-8859-1" },
00094     {"es_CO" ,"espanol.hash"        ,"iso-8859-1" },
00095     {"es_CR" ,"espanol.hash"        ,"iso-8859-1" },
00096     {"es_DO" ,"espanol.hash"        ,"iso-8859-1" },
00097     {"es_EC" ,"espanol.hash"        ,"iso-8859-1" },
00098     {"es_ES" ,"espanol.hash"        ,"iso-8859-1" },
00099     {"es_GT" ,"espanol.hash"        ,"iso-8859-1" },
00100     {"es_HN" ,"espanol.hash"        ,"iso-8859-1" },
00101     {"es_MX" ,"espanol.hash"        ,"iso-8859-1" },
00102     {"es_NI" ,"espanol.hash"        ,"iso-8859-1" },
00103     {"es_PA" ,"espanol.hash"        ,"iso-8859-1" },
00104     {"es_PE" ,"espanol.hash"        ,"iso-8859-1" },
00105     {"es_PR" ,"espanol.hash"        ,"iso-8859-1" },
00106     {"es_PY" ,"espanol.hash"        ,"iso-8859-1" },
00107     {"es_SV" ,"espanol.hash"        ,"iso-8859-1" },
00108     {"es_UY" ,"espanol.hash"        ,"iso-8859-1" },
00109     {"es_VE" ,"espanol.hash"        ,"iso-8859-1" },
00110     {"fi"    ,"finnish.hash"        ,"iso-8859-1" },
00111     {"fi_FI" ,"finnish.hash"        ,"iso-8859-1" },
00112     {"fr"    ,"francais.hash"       ,"iso-8859-1" },
00113     {"fr_BE" ,"francais.hash"       ,"iso-8859-1" },
00114     {"fr_CA" ,"francais.hash"       ,"iso-8859-1" },
00115     {"fr_CH" ,"francais.hash"       ,"iso-8859-1" },
00116     {"fr_FR" ,"francais.hash"       ,"iso-8859-1" },
00117     {"fr_LU" ,"francais.hash"       ,"iso-8859-1" },
00118     {"fr_MC" ,"francais.hash"       ,"iso-8859-1" },
00119     {"hu"    ,"hungarian.hash"      ,"iso-8859-2" },
00120     {"hu_HU" ,"hungarian.hash"      ,"iso-8859-2" },
00121     {"ga"    ,"irish.hash"          ,"iso-8859-1" },
00122     {"ga_IE" ,"irish.hash"          ,"iso-8859-1" },
00123     {"gl"    ,"galician.hash"       ,"iso-8859-1" },
00124     {"gl_ES" ,"galician.hash"       ,"iso-8859-1" },
00125     {"ia"    ,"interlingua.hash"    ,"iso-8859-1" },
00126     {"it"    ,"italian.hash"        ,"iso-8859-1" },
00127     {"it_IT" ,"italian.hash"        ,"iso-8859-1" },
00128     {"it_CH" ,"italian.hash"        ,"iso-8859-1" },
00129     {"la"    ,"mlatin.hash"         ,"iso-8859-1" },
00130     {"la_IT" ,"mlatin.hash"         ,"iso-8859-1" },
00131     {"lt"    ,"lietuviu.hash"       ,"iso-8859-13" },
00132     {"lt_LT" ,"lietuviu.hash"       ,"iso-8859-13" },
00133     {"nl"    ,"nederlands.hash"     ,"iso-8859-1" },
00134     {"nl_NL" ,"nederlands.hash"     ,"iso-8859-1" },
00135     {"nl_BE" ,"nederlands.hash"     ,"iso-8859-1" },
00136     {"nb"    ,"norsk.hash"          ,"iso-8859-1" },
00137     {"nb_NO" ,"norsk.hash"          ,"iso-8859-1" },
00138     {"nn"    ,"nynorsk.hash"        ,"iso-8859-1" },
00139     {"nn_NO" ,"nynorsk.hash"        ,"iso-8859-1" },
00140     {"no"    ,"norsk.hash"          ,"iso-8859-1" },
00141     {"no_NO" ,"norsk.hash"          ,"iso-8859-1" },
00142     {"pl"    ,"polish.hash"         ,"iso-8859-2" },
00143     {"pl_PL" ,"polish.hash"         ,"iso-8859-2" },
00144     {"pt"    ,"brazilian.hash"      ,"iso-8859-1" },
00145     {"pt_BR" ,"brazilian.hash"      ,"iso-8859-1" },
00146     {"pt_PT" ,"portugues.hash"      ,"iso-8859-1" },
00147     {"ru"    ,"russian.hash"        ,"koi8-r" },
00148     {"ru_MD" ,"russian.hash"        ,"koi8-r" },
00149     {"ru_RU" ,"russian.hash"        ,"koi8-r" },
00150     {"sc"    ,"sardinian.hash"      ,"iso-8859-1" },
00151     {"sc_IT" ,"sardinian.hash"      ,"iso-8859-1" },
00152     {"sk"    ,"slovak.hash"         ,"iso-8859-2" },
00153     {"sk_SK" ,"slovak.hash"         ,"iso-8859-2" },
00154     {"sl"    ,"slovensko.hash"      ,"iso-8859-2" },
00155     {"sl_SI" ,"slovensko.hash"      ,"iso-8859-2" },
00156     {"sv"    ,"svenska.hash"        ,"iso-8859-1" },
00157     {"sv_SE" ,"svenska.hash"        ,"iso-8859-1" },
00158     {"uk"    ,"ukrainian.hash"      ,"koi8-u" },
00159     {"uk_UA" ,"ukrainian.hash"      ,"koi8-u" },
00160     {"yi"    ,"yiddish-yivo.hash"   ,"utf-8" }
00161 };
00162 
/* Number of rows in ispell_map, computed from the array itself. */
00163 static const size_t size_ispell_map = ( sizeof(ispell_map) / sizeof((ispell_map)[0]) );
/* Filled by s_allDics(): language tag -> name of a matching hash file that
 * was found in one of the ispell_dirs directories. */
00164 static QMap<QString, QString> ispell_dict_map;
00165 
00166 
00167 void
00168 ISpellChecker::try_autodetect_charset(const char * const inEncoding)
00169 {
00170     if (inEncoding && strlen(inEncoding))
00171         {
00172             m_translate_in = QTextCodec::codecForName(inEncoding);
00173         }
00174 }
00175 
00176 /***************************************************************************/
00177 /***************************************************************************/
00178 
00179 ISpellChecker::ISpellChecker()
00180     : deftflag(-1),
00181      prefstringchar(-1),
00182      m_bSuccessfulInit(false),
00183      m_BC(NULL),
00184      m_cd(NULL),
00185      m_cl(NULL),
00186      m_cm(NULL),
00187      m_ho(NULL),
00188      m_nd(NULL),
00189      m_so(NULL),
00190      m_se(NULL),
00191      m_ti(NULL),
00192      m_te(NULL),
00193      m_hashstrings(NULL),
00194      m_hashtbl(NULL),
00195      m_pflaglist(NULL),
00196      m_sflaglist(NULL),
00197      m_chartypes(NULL),
00198      m_infile(NULL),
00199      m_outfile(NULL),
00200      m_askfilename(NULL),
00201      m_Trynum(0),
00202      m_translate_in(0)
00203 {
00204     memset(m_sflagindex,0,sizeof(m_sflagindex));
00205     memset(m_pflagindex,0,sizeof(m_pflagindex));
00206 }
00207 
00208 #ifndef FREEP
00209 #define FREEP(p)        do { if (p) free(p); } while (0)
00210 #endif
00211 
00212 ISpellChecker::~ISpellChecker()
00213 {
00214     if (m_bSuccessfulInit) {
00215         // only cleanup our mess if we were successfully initialized
00216 
00217         clearindex (m_pflagindex);
00218         clearindex (m_sflagindex);
00219     }
00220 
00221     FREEP(m_hashtbl);
00222     FREEP(m_hashstrings);
00223     FREEP(m_sflaglist);
00224     FREEP(m_chartypes);
00225 
00226     delete m_translate_in;
00227     m_translate_in = 0;
00228 }
00229 
00230 bool
00231 ISpellChecker::checkWord( const QString& utf8Word )
00232 {
00233     ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
00234     if (!m_bSuccessfulInit)
00235         return false;
00236 
00237     if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty())
00238         return false;
00239 
00240     bool retVal = false;
00241     QCString out;
00242     if (!m_translate_in)
00243         return false;
00244     else {
00245         /* convert to 8bit string and null terminate */
00246         int len_out = utf8Word.length();
00247 
00248         out = m_translate_in->fromUnicode( utf8Word, len_out );
00249     }
00250 
00251     if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
00252         {
00253             if (good(iWord, 0, 0, 1, 0) == 1 ||
00254                 compoundgood(iWord, 1) == 1)
00255                 {
00256                     retVal = true;
00257                 }
00258         }
00259 
00260     return retVal;
00261 }
00262 
00263 QStringList
00264 ISpellChecker::suggestWord(const QString& utf8Word)
00265 {
00266     ichar_t  iWord[INPUTWORDLEN + MAXAFFIXLEN];
00267     int  c;
00268 
00269     if (!m_bSuccessfulInit)
00270         return QStringList();
00271 
00272     if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) ||
00273             utf8Word.length() == 0)
00274         return QStringList();
00275 
00276     QCString out;
00277     if (!m_translate_in)
00278         return QStringList();
00279     else
00280         {
00281             /* convert to 8bit string and null terminate */
00282 
00283             int len_out = utf8Word.length();
00284             out = m_translate_in->fromUnicode( utf8Word, len_out );
00285         }
00286 
00287     if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
00288         makepossibilities(iWord);
00289     else
00290         return QStringList();
00291 
00292     QStringList sugg_arr;
00293     for (c = 0; c < m_pcount; c++)
00294     {
00295         QString utf8Word;
00296 
00297         if (!m_translate_in)
00298         {
00299             /* copy to 8bit string and null terminate */
00300             utf8Word = QString::fromUtf8( m_possibilities[c] );
00301         }
00302         else
00303         {
00304             /* convert to 32bit string and null terminate */
00305             utf8Word = m_translate_in->toUnicode( m_possibilities[c] );
00306         }
00307 
00308         sugg_arr.append( utf8Word );
00309     }
00310 
00311     return sugg_arr;
00312 }
00313 
00314 static void
00315 s_buildHashNames (std::vector<std::string> & names, const char * dict)
00316 {
00317     const char * tmp = 0;
00318     int i = 0;
00319 
00320     names.clear ();
00321 
00322     while ( (tmp = ispell_dirs[i++]) ) {
00323         QCString maybeFile = QCString( tmp ) + '/';
00324         maybeFile += dict;
00325         names.push_back( maybeFile.data() );
00326     }
00327 }
00328 
00329 static void
00330 s_allDics()
00331 {
00332     const char * tmp = 0;
00333     int i = 0;
00334 
00335     while ( (tmp = ispell_dirs[i++]) ) {
00336         QDir dir( tmp );
00337         QStringList lst = dir.entryList( "*.hash" );
00338         for ( QStringList::Iterator it = lst.begin(); it != lst.end(); ++it ) {
00339             QFileInfo info( *it );
00340             for (size_t i = 0; i < size_ispell_map; i++)
00341             {
00342                 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
00343                 if (!strcmp (info.fileName().latin1(), mapping->dict))
00344                 {
00345                     ispell_dict_map.insert( mapping->lang, *it );
00346                 }
00347             }
00348         }
00349     }
00350 }
00351 
00352 QValueList<QString>
00353 ISpellChecker::allDics()
00354 {
00355     if ( ispell_dict_map.empty() )
00356         s_allDics();
00357 
00358     return ispell_dict_map.keys();
00359 }
00360 
00361 QString
00362 ISpellChecker::loadDictionary (const char * szdict)
00363 {
00364     std::vector<std::string> dict_names;
00365 
00366     s_buildHashNames (dict_names, szdict);
00367 
00368     for (size_t i = 0; i < dict_names.size(); i++)
00369         {
00370             if (linit(const_cast<char*>(dict_names[i].c_str())) >= 0)
00371                 return dict_names[i].c_str();
00372         }
00373 
00374     return QString::null;
00375 }
00376 
00383 bool
00384 ISpellChecker::loadDictionaryForLanguage ( const char * szLang )
00385 {
00386     QString hashname;
00387 
00388     const char * encoding = NULL;
00389     const char * szFile = NULL;
00390 
00391     for (size_t i = 0; i < size_ispell_map; i++)
00392         {
00393             const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
00394             if (!strcmp (szLang, mapping->lang))
00395                 {
00396                     szFile = mapping->dict;
00397                     encoding = mapping->enc;
00398                     break;
00399                 }
00400         }
00401 
00402     if (!szFile || !strlen(szFile))
00403         return false;
00404 
00405     alloc_ispell_struct();
00406 
00407     hashname = loadDictionary(szFile);
00408     if (hashname.isEmpty())
00409         return false;
00410 
00411     // one of the two above calls succeeded
00412     setDictionaryEncoding (hashname, encoding);
00413 
00414     return true;
00415 }
00416 
00417 void
00418 ISpellChecker::setDictionaryEncoding( const QString& hashname, const char * encoding )
00419 {
00420     /* Get Hash encoding from XML file. This should always work! */
00421     try_autodetect_charset(encoding);
00422 
00423     if (m_translate_in)
00424         {
00425             /* We still have to setup prefstringchar*/
00426             prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag
00427                               : static_cast<int *>(NULL));
00428 
00429             if (prefstringchar < 0)
00430                 {
00431                     std::string teststring;
00432                     for(int n1 = 1; n1 <= 15; n1++)
00433                         {
00434                             teststring = "latin" + n1;
00435                             prefstringchar = findfiletype(teststring.c_str(), 1,
00436                                               deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00437                             if (prefstringchar >= 0)
00438                                 break;
00439                         }
00440                 }
00441 
00442             return; /* success */
00443         }
00444 
00445     /* Test for UTF-8 first */
00446     prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00447     if (prefstringchar >= 0)
00448         {
00449             m_translate_in = QTextCodec::codecForName("utf8");
00450         }
00451 
00452     if (m_translate_in)
00453         return; /* success */
00454 
00455     /* Test for "latinN" */
00456     if (!m_translate_in)
00457         {
00458             /* Look for "altstringtype" names from latin1 to latin15 */
00459             for(int n1 = 1; n1 <= 15; n1++)
00460                 {
00461                     QString teststring = QString("latin%1").arg(n1);
00462                     prefstringchar = findfiletype(teststring.latin1(), 1,
00463                                       deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00464                     if (prefstringchar >= 0)
00465                         {
00466                             //FIXME: latin1 might be wrong
00467                             m_translate_in = QTextCodec::codecForName( teststring.latin1() );
00468                             break;
00469                         }
00470                 }
00471         }
00472 
00473     /* If nothing found, use latin1 */
00474     if (!m_translate_in)
00475         {
00476             m_translate_in = QTextCodec::codecForName("latin1");
00477         }
00478 }
00479 
00480 bool
00481 ISpellChecker::requestDictionary(const char *szLang)
00482 {
00483     if (!loadDictionaryForLanguage (szLang))
00484         {
00485             // handle a shortened version of the language tag: en_US => en
00486             std::string shortened_dict (szLang);
00487             size_t uscore_pos;
00488 
00489             if ((uscore_pos = shortened_dict.rfind ('_')) != ((size_t)-1)) {
00490                 shortened_dict = shortened_dict.substr(0, uscore_pos);
00491                 if (!loadDictionaryForLanguage (shortened_dict.c_str()))
00492                     return false;
00493             } else
00494                 return false;
00495         }
00496 
00497     m_bSuccessfulInit = true;
00498 
00499     if (prefstringchar < 0)
00500         m_defdupchar = 0;
00501     else
00502         m_defdupchar = prefstringchar;
00503 
00504     return true;
00505 }
KDE Home | KDE Accessibility Home | Description of Access Keys