1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Module to guess the language ISO code based on the 'Language-Team' entry in
22 the header of a Gettext PO file."""
23
24 import re
25
26 from translate.misc.typecheck import accepts, returns, IsOneOf
27 from translate.misc.typecheck.typeclasses import String
28
29 __all__ = ['LANG_TEAM_CONTACT_SNIPPETS', 'guess_language']
30
# Each row is (prefilter, regex, postfilter):
#   prefilter  - plain substring, a cheap test before the regex is tried
#   regex      - pattern whose single group captures the language code
#   postfilter - list of captured values to reject, or None for no rejection
# Patterns are raw strings with the literal dots of the host names escaped,
# so "." cannot match an arbitrary character (e.g. "af@lizorg").
LANG_TEAM_REGEX = (
    ("@li.org", r"([a-z_A-Z]{2,})@li\.org", ["LL", "XX", "TEAM"]),
    ("translation-team", r"translation-team-([a-z_A-Z]+)@lists\.sourceforge\.net", None),
    ("fedora-trans", r"fedora-trans-([a-z_A-Z]+)@redhat\.com", ["list"]),
    ("ubuntu-l10n", r"ubuntu-l10n-([a-z_A-Z]+)@lists\.ubuntu\.com", None),
    ("translate-discuss", r"translate-discuss-([a-z_A-Z]+)@lists\.sourceforge\.net", None),
    ("kde-i18n", r"kde-i18n-([a-z_A-Z]+)@(?:lists\.|mail\.|)kde\.org", ["doc"]),
    ("kde-l10n", r"kde-l10n-([a-z_A-Z]+)@kde\.org", None),
    ("fedoraproject", r"trans-([a-z_A-Z]+)@lists\.fedoraproject\.org", None),
    ("gnome.org", r"gnome-([a-z_A-Z]+)-list@gnome\.org", ["latin"]),
)
"""Data for regular expression based extraction. The fields are: prefilter
information, regex with single group that contains the language code,
postfilter."""
44
# Maps a language code to a tuple of contact fragments (mailing-list
# addresses, domains, names). Every value must be a tuple, so single-entry
# values keep their trailing comma.
# The Softcatala addresses live under "ca" only: "ca" already owns
# "@softcatala.org", so listing "fedora@softcatala.org" under another
# language would let one team string match two languages.
LANG_TEAM_CONTACT_SNIPPETS = {
    "af": ("i18n@af.org.za", "Petri Jooste",),
    "am": ("@geez.org",),
    "ar": ("arabeyes.org", "Arabeyes",),
    "as": ("assam@mm.assam-glug.org",),
    "ast": ("@softastur.org", "launchpad.net/~ubuntu-l10n-ast", "softast-xeneral@lists.sourceforge.net", "Softastur",),
    "az": ("linuxaz@azerimal.net", "gnome@azitt.com", u"gnome@azətt.com",),
    "az_IR": ("az-ir@lists.sharif.edu",),
    "be": ("i18n@mova.org", "i18n@tut.by", "mozilla_byx@poczta.fm",),
    "be@latin": ("translation-team-be-latin@lists", "be-latin.open-tran.eu",),
    "bg": ("dict@fsa-bg.org", "dict@linux.zonebg.com",),
    "bn": ("gnome-translation@bengalinux.org", "core@bengalinux.org", "ankur-bd-l10n@googlegroups.com", "redhat-translation@bengalinux.org",),
    "bn_IN": ("anubad@lists.ankur.org.in",),
    "br": ("drouizig@drouizig.org", "brenux@free.fr",),
    "bs": ("lokal@linux.org.ba", "lokal@lugbih.org",),
    "ca": ("@softcatala.org", "tradgnome@softcatala.net", "fedora@softcatala.org",),
    "crh": ("tilde-birlik-tercime@lists.sourceforge.net",),
    "cs": ("fedora-cs-list@redhat.com", "cs-users@lists.fedoraproject.org", "debian-l10n-czech@lists.debian.org",
           "kde-czech-apps@lists.sourceforge.net", "kde-czech-apps@lists.sf.net", "translations.cs@gnupg.cz",),
    "cy": ("gnome-cy@lists.linux.org.uk", "gnome-cy@pengwyn.linux.org.uk", "gnome-cy@www.linux.org", "gnome-cy@www.linux.org.uk", "cy@pengwyn.linux.org.uk",),
    "da": ("dansk@dansk-gruppen.dk", "dansk@klid.dk", "sslug-locale@sslug.dk",),
    "de": ("gnome-de@gnome.org", "debian-l10n-german@lists.debian.org",),
    "dz": ("pgeyleg@dit.gov.bt", "pgyeleg@dit.gov.bt",),
    "el": ("debian-l10n-greek@lists.debian.org", "i18ngr@lists.hellug.gr", "i18n@hellug.gr", "nls@tux.hellug.gr", "team@gnome.gr", "team@lists.gnome.gr", "users@el.openoffice.org",),
    "en_AU": ("trans@six-by-nine.com.au",),
    "en_CA": ("adamw@gnome.org", "adamw@freebsd.org",),
    "en_GB": ("kde-en-gb@kde.me.uk",),
    "en@shaw": ("ubuntu-l10n-en-shaw@launchpad.net", "ubuntu-l10n-en-shaw@lists.launchpad.net",),
    "eo": ("eo-tradukado@lists.tuxfamily.org", "debian-l10n-esperanto@lists.debian.org", "ubuntu-l10n-eo@lists.launchpad.net", "eo-tradukado.tuxfamily.org",),
    "es": ("pgsql-es-ayuda@postgresql.org", "debian-l10n-spanish@lists.debian.org", "gnome-es@gnome.org", "traductores@es.gnome.org",),
    "et": ("gnome-et@linux.ee", "kde-et@linux.ee", "linux-ee@lists.eenet.ee", "linux-et@lists.eenet.ee", "et-gnome@linux.ee", "linux-ee@eenet.ee",),
    "eu": ("debian-l10n-basque@lists.debian.org", "debian-l10n-eu@lists.debian.org", "itzulpena@euskalgnu.org", "gnome@euskalgnu.org", "librezale@librezale.org", "linux-eu@chanae.alphanet.ch",),
    "fa": ("farsi@lists.sharif.edu", "Farsiweb.info",),
    "fi": ("debian-l10n-finnish@lists.debian.org", "gnome-fi-laatu@lists.sourceforge.net", "laatu@lokalisointi.org",
           "lokalisointi-laatu@linux-aktivaattori.org", "laatu@gnome.fi", "yast-trans-fi@kotoistaminen.novell.fi",),
    "fr": ("debian-l10n-french@lists.debian.org", "gnomefr@traduc.org", "kde-francophone@kde.org", "traduc@traduc.org", "pgsql-fr-generale@postgresql.org", "rpm-fr@livna.org",),
    "ga": ("gaeilge-gnulinux@lists.sourceforge.net", "gaeilge-a@listserv.heanet.ie",),
    "gl": ("trasno@ceu.fi.udc.es", "gnome@g11n.net", "gpul-traduccion@ceu.fi.udc.es", "proxecto@trasno.net", "trasno@gpul.org",),
    "gu": ("indianoss-gujarati@lists.sourceforge.net",),
    "he": ("debian-hebrew-common@lists.alioth.debian.org", "kde-il@yahoogroups.com", "fedora-he-list@redhat.com", "mdk-hebrew@iglu.org.il",),
    "hi": ("indlinux-hindi-gnome@lists.sourceforge.net", "indlinux-hindi@lists.sourceforge.net",),
    "hr": ("translator-shop.org", "lokalizacija@linux.hr",),
    "hu": ("debian-l10n-hungarian@lists.debian.org", "gnome@fsf.hu", "gnome@gnome.hu", "magyar@lists.linux.hu",),
    "id": ("@id.gnome.org", "@gnome.linux.or.id", "mdk-id@yahoogroups.com", "linux.or.id", "gnome@i15n.org",),
    "io": ("gnome-ido@lists.mterry.name",),
    "is": ("gnome@techattack.nu", "kde-isl@mmedia.is", "kde-isl@molar.is",),
    "it": ("debian-l10n-italian@lists.debian.org", "traduzioni@itpug.org", "fedora-trans-it@redhat.com", "tp@lists.linux.it",),
    "ja": ("debian-doc@debian.or.jp", "debian-japanese@lists.debian.org", "gnome-translation@gnome.gr.jp", "translation@gnome.gr.jp", "jpug-doc@ml.postgresql.jp",),
    "ka": ("geognome@googlegroups.com", "Ubuntu-Georgian-Translators@googlegroups.com",),
    "kk": ("kk_KZ@googlegroups.com",),
    "km": ("@khmeros.info",),
    "kn": ("debian-l10n-kannada@lists.debian.org",),
    "ko": ("gnome-kr-hackers@list.kldp.net", "gnome-kr-hackers@lists.kldp.net", "gnome-kr-translation@lists.kldp.net", "pgsql-kr@postgresql.or.kr",
           "hangul-hackers@lists.kldp.net", "debian-l10n-korean@lists.debian.org", "gnome-kr-translation@lists.sourceforge.net",),
    "ks": ("ks-gnome-trans-commits@lists.code.indlinux.net",),
    "ku": ("gnu-ku-wergerandin@lists.sourceforge.net",),
    "ky": ("i18n-team-ky-kyrgyz@lists.sourceforge.net", "ky-li@mail.ru",),
    "la": ("gnome-latin-list@gnome.org",),
    "li": ("li@gnome.org",),
    "lt": ("gimp-lt@lists.akl.lt", "gnome-lt@lists.akl.lt", "gnome-lt@lists.gnome.org", "komp_lt@konferencijos.lt",),
    "lv": ("lata-l10n@googlegroups.com", "lata-i18n@groups.google.com", "locale@laka.lv", "ll10nt@os.lv",),
    "mai": ("maithili.sf.net",),
    "mg": ("i18n-malagasy-gnome@gnome.org",),
    "mi": ("maori@nzlinux.org.nz",),
    "mk": ("gnomk-main@lists.sourceforge.net", "lug@lists.linux.net.mk", "mkde-l10n@lists.sourceforge.net", "ossm-members@hedona.on.net.mk",),
    "ml": ("smc-discuss@googlegroups.com",),
    "mn": ("openmn-", "openmn.org",),
    "ms": ("gabai-penyumbang@lists.sourceforge.net", "gabai-penyumbang@lists.sf.net", "kedidiemas@yahoogroups.com",),
    "nb": ("i18n-nb@lister.ping.uio.no",),
    "nds": ("nds-lowgerman@lists.sourceforge.net",),
    "ne": ("info@mpp.org.np",),
    "nl": ("debian-l10n-dutch@lists.debian.org", "vertaling@nl.gnome.org", "vertaling@vrijschrift.org", "nl@vrijschrift.org", "vertaling@nl.linux.org", "vertaling@nl.li.org",),
    "nn": ("i18n-nn@lister.ping.uio.no",),
    "nso": ("sepedi@translate.org.za",),
    "or": ("oriya-group@lists.sarovar.org", "oriya-it@googlegroups.com",),
    "pa": ("punjabi-l10n@users.sf.net", "fedora-pa-list@redhat.com", "punjabi-users@lists.sf.net", "punjabi-l10n@lists.sourceforge.net", "punlinux-i18n@lists.sourceforge.net",),
    "pl": ("gnomepl@aviary.pl", "debian-l10n-polish@lists.debian.org", "gnome-l10n@lists.aviary.pl", "translators@gnomepl.org",),
    "ps": ("pathanisation@googelgroups.com",),
    "pt": ("fedora-trans-pt@redhat.org", "gnome_pt@yahoogroups.com", "traduz@debianpt.org", "traduz@debian.pt",),
    "pt_BR": ("gnome-l10n-br@listas.cipsga.org.br", "gnome-pt_br-list@gnome.org", "fedora-docs-br@redhat.com", "fedora-trans-pt-br@redhat.com",
              "ldp-br@bazar.conectiva.com.br", "pgbr-dev@postgresql.org.br", "pgbr-dev@listas.postgresql.org.br", "debian-l10n-portuguese@lists.debian.org",),
    "ro": ("fedora-ro@googlegroups.com", "gnomero-list@lists.sourceforge.net", "debian-l10n-romanian@lists.debian.org",),
    "ru": ("pgsql-rus@yahoogroups.com", "debian-l10n-russian@lists.debian.org", "gnupg-ru@gnupg.org",),
    "sk": ("sk-i18n@lists.linux.sk", "kde-sk@linux.sk",),
    "sl": ("gnome-si@googlegroups.com",),
    "sq": ("gnome-albanian-perkthyesit@lists.sourceforge.net", "debian-l10n-albanian@lists.debian.org",),
    "sr": ("@prevod.org", "serbiangnome-lista@nongnu.org",),
    "sv": ("debian-l10n-swedish@lists.debian.org", "tp-sv@listor.tp-sv.se",),
    "ta": ("gnome-tamil-translation@googlegroups.com", "tamilinix@yahoogroups.com", "Ubuntu-l10n-tam@lists.ubuntu.com", "tamil-DI@yahoogroups.com",),
    "te": ("localisation@swecha.org", "indlinux-telugu@lists.sourceforge.net",),
    "th": ("l10n@opentle.org", "thai-l10n@googlegroup.com", "thailang@buraphalinux.org", "thai-l10n@googlegroups.com", "l10n.opentle.org",),
    "tk": ("kakilikgroup@yahoo.com",),
    "tl": ("debian-tl@banwa.upm.edu.ph",),
    "tr": ("debian-l10n-turkish@lists.debian.org", "gnome-turk@gnome.org", "gnu-tr-u12a@lists.sourceforge.net", "turkce@pardus.org.tr",),
    "tt": ("tatarish.l10n@gmail.com",),
    "ug": ("gnome-uighur@yahoogroups.com",),
    "uk": ("linux@linux.org.ua",),
    "ur": ("l10n@urduweb.org", "urdu.scs.gift@gmail.com",),
    "ve": ("venda@translate.org.za",),
    "vi": ("gnomevi-list@lists.sourceforge.net", "vi-VN@googlegroups.com",),
    "wa": ("linux-wa@",),
    "xh": ("xh-translate@ubuntu.com", "xhosa@translate.org.za", "xhosa@ubuntu.com",),
    "zh_CN": ("i18n-translation@lists.linux.net.cn", "i18n-zh@googlegroups.com", "translation-team-zh-cn@lists.sourceforge.net", "i18n-zh@googlegroup.com",),
    "zh_TW": ("zh-l10n@lists.linux.org.tw", "chinese-l10n@googlegroups.com", "community@linuxhall.org", "zh-l10n@linux.org.tw",),
    "zu": ("zulu@translate.org.za",),
}
"""Language codes with snippets of contact information that can be used to
uniquely identify the language"""
153
# Maps a language code to a tuple of language-name fragments (English names,
# native spellings, common variants). Every value must be a tuple, so
# single-entry values keep their trailing comma.
# NOTE(review): more specific codes ("be@latin", "bn_IN", "pt_BR") are listed
# ahead of their base language, presumably so they win a first-match
# substring search; a plain dict does not guarantee that order on every
# Python version -- confirm against the code that consumes this table.
LANG_TEAM_LANGUAGE_SNIPPETS = {
    "af": ("Afrikaans",),
    "am": ("Amharic",),
    "ang": ("Old English",),
    "ar": ("Arabic",),
    "as": ("Assamese",),
    "ast": ("Asturian",),
    "az": ("Azerbaijani", u"Azərbaycan",),
    "bg": ("Bulgarian",),
    "be@latin": ("Belarusian Latin",),
    "be": ("Belarusian", "Belorussian",),
    "bn_IN": ("Bengali (India)", "Bengali INDIA", "Bengali India",),
    "bn": ("Bangladeshi", "Bengali",),
    "br": ("Breton", "Britton",),
    "bs": ("Bosanski", "Bosnian",),
    "byn": ("Blin",),
    "ca": ("Catalan",),
    "ckb": ("Kurdish (Sorani)",),
    "crh": ("Crimean Tatar", "Crimean Turkish",),
    "cs": ("Czech",),
    "cy": ("Cymru", "Welsh",),
    "da": ("Danish", "Dansk",),
    "de": ("Deutsch", "German",),
    "dz": ("Dzongkha",),
    "el": ("Greek",),
    "en_GB": ("British English", "en_GB", "English (Great Britain)",),
    "eo": ("Esperanto",),
    "es": ("Spanish", "es_ES", u"Español",),
    "et": ("Eesti", "Estonian",),
    "eu": ("Basque", "Euskara",),
    "fa": ("Persian",),
    "fi": ("Finnish", "Suomi",),
    "fo": ("Faroese",),
    "fr": ("French", u"Français",),
    "fur": ("Friulian",),
    "ga": ("Irish",),
    "gez": ("Geez",),
    "gl": ("Galego", "Galician", "Gallegan", "gl_ES",),
    "gu": ("Gujarati",),
    "haw": ("Hawaiian",),
    "he": ("Hebrew",),
    "hi": ("Hindi",),
    "hr": ("Croatian",),
    "hu": ("Hungarian",),
    "hy": ("Armenian",),
    "ia": ("Interlingua",),
    "id": ("Bahasa Indonesia", "Indonesia", "Indonesian",),
    "ig": ("Igbo",),
    "is": ("Icelandic",),
    "it": ("Italian",),
    "ja": ("Japanese",),
    "ka": ("Georgian",),
    "kk": ("Kazakh",),
    "km": ("Khmer",),
    "kn": ("Kannada",),
    "ko": ("Korean", "Hangul",),
    "kok": ("Konkani",),
    "ks": ("Kashmiri",),
    "ku": ("Kurdish",),
    "ky": ("Kitghiz", "Kirghiz",),
    "lg": ("Luganda",),
    "li": ("Limburgish",),
    "lt": ("Lithuanian",),
    "lv": ("Latvian", "lv_LV", "Valoda", u"Latviešu",),
    # NOTE(review): "mal" carries the same snippet as "ml" below, so which
    # code a Malayalam team string yields depends on iteration order --
    # confirm whether the three-letter key is still wanted.
    "mal": ("Malayalam",),
    "mg": ("Malagasy",),
    "mi": ("Maori",),
    "mk": ("Macedonian",),
    "ml": ("Malayalam",),
    "mn": ("Mongolian",),
    # Marathi is ISO 639-1 "mr"; "mt" is the code for Maltese.
    "mr": ("Marathi",),
    "ms": ("Malay", "Bahasa Melayu",),
    "mt": ("Maltese",),
    "my": ("Burmese",),
    "nb": ("Norwegian Bokmaal", u"Norsk bokmål", u"Norwegian Bokmål", u"Norwegian bokmål",),
    "nds": ("Low Saxon",),
    "nl": ("Dutch", "Nederlands",),
    "nn": ("Norwegian nynorsk", "Nynorsk",),
    "oc": ("Occitan",),
    "or": ("Oriya",),
    "pa": ("Punjabi", "Panjabi",),
    "pl": ("Polish",),
    "ps": ("Pashto", "Pushto",),
    "pt_BR": ("Brazilian Portuguese", u"Português/Brasil", u"Português do Brasil",),
    "pt": ("Portuguese",),
    "rm": ("Rhaeto-Romance",),
    "ro": ("Romania", "Romanian", u"Română",),
    "ru": ("Russian",),
    "si": ("Sinhala", "Sinhalese",),
    "sk": ("Slovak",),
    "sl": ("Slovene", "Slovenian",),
    "so": ("Somali",),
    "sq": ("Albanian",),
    "sr": ("Serbian",),
    "sv": ("Swedish",),
    "sw": ("Swahili",),
    "ta": ("Tamil",),
    "te": ("Telugu",),
    "tet": ("Tetum",),
    "tg": ("Tajik",),
    "th": ("Thai",),
    "ti": ("Tigrinya",),
    "tig": ("Tigre",),
    "tl": ("Tagalog",),
    "tr": ("Turkish", u"Türkçe", u"Türkiye",),
    "tt": ("Tatarish",),
    "ug": ("Uighur",),
    "uk": ("Ukrainian",),
    "ur": ("Urdu",),
    "uz": ("Uzbek",),
    "ve": ("Venda", u"Tshivenḓa", "Tshivenda",),
    "vi": ("Vietnamese",),
    "wa": ("Walloon",),
    "wal": ("Walamo",),
    "wo": ("Wolof",),
    "xh": ("Xhosa", "IsiXhosa", "isiXhosa",),
    "yi": ("Yiddish",),
    "yo": ("Yoruba",),
    "zh_CN": ("Chinese Simplified", "Chinese/Simplified", "Chinese (simplified)", "Simplified Chinese",),
    "zh_HK": ("Chinese (Hong Kong)",),
    "zh_TW": ("Chinese (traditional)", "Chinese/Traditional", "Traditional Chinese",),
}
"""Language codes with snippets of language names, including English, native
spelling and variants, that can be used to uniquely identify the language"""
279 """Use regular expressions to extract the language team
280
281 @param prefilter: simple filter to apply before attempting the regex
282 @param regex: regular expression with one group that will contain
283 the language code
284 @param string: the language team string that should be examined
285 @param postfilter: filter to apply to reject any potential matches
286 after they have been retrieved by the regex
287 @return: ISO language code for the found language
288 """
289
290
291
292
293
294 if prefilter in string:
295 found = re.search(regex, string)
296 if found:
297 regex_lang = found.groups()[0]
298 else:
299 return None
300 if postfilter is not None and regex_lang in postfilter:
301 return None
302 if regex_lang and regex_lang != 'en':
303 return regex_lang
304 return None
305
307 """Return the supplied text unchanged"""
308 return text
309
312 """Convert the supplied text to lowercase"""
313 return text.lower()
314
317 """Guess the language based on a snippet of text in the language team
318 string.
319
320 @param snippets_dict: A dict of snippets that can be used to identify a
321 language in the format {'lang': ('snippet1', 'snippet2'), 'lang2'...}
322 @param string: The language string to be analysed
323 @param filter_: a function to be applied to the string and snippets
324 before examination
325 """
326 string = filter_(string)
327 for possible_lang, snippets in snippets_dict.iteritems():
328 for snippet in snippets:
329 if filter_(snippet) in string:
330 return possible_lang
331 return None
332
333
334 @accepts(unicode)
335 @returns(IsOneOf(String, type(None)))
336 -def guess_language(team_string):
356