1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """This module contains all the common features for languages.
23
24 Supported features
25 ==================
26 - language code (km, af)
27 - language name (Khmer, Afrikaans)
28 - Plurals
29 - Number of plurals (nplurals)
30 - Plural equation
31 - pofilter tests to ignore
32
33 Segmentation
34 ------------
35 - characters
36 - words
37 - sentences
38
39 TODOs and Ideas for possible features
40 =====================================
41 - Language-Team information
42 - Segmentation
43 - phrases
44
45 Punctuation
46 -----------
47 - End of sentence
48 - Start of sentence
49 - Middle of sentence
50 - Quotes
51 - single
52 - double
53
54 - Valid characters
55 - Accelerator characters
56 - Special characters
57 - Direction (rtl or ltr)
58 """
59
60 import re
61
62 from translate.lang import data
63
64
66 """This class is the common parent class for all language classes."""
67
68 code = ""
69 """The ISO 639 language code, possibly with a country specifier or other
70 modifier.
71
72 Examples::
73 km
74 pt_BR
75 sr_YU@Latn
76 """
77
78 fullname = ""
79 """The full (English) name of this language.
80
81 Dialect codes should have the form of
82 - Khmer
83 - Portuguese (Brazil)
84 - TODO: sr_YU@Latn?
85 """
86
87 nplurals = 0
88 """The number of plural forms of this language.
89
90 0 is not a valid value - it must be overridden.
91 Any positive integer is valid (it should probably be between 1 and 6)
92 @see: L{data}
93 """
94
95 pluralequation = "0"
96 """The plural equation for selection of plural forms.
97
98 This is used for PO files to fill into the header.
99 @see: U{Gettext manual<http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html#Plural-forms>}
100 @see: L{data}
101 """
102
103
104
105 listseperator = u", "
106 """This string is used to separate lists of textual elements. Most
107 languages probably can stick with the default comma, but Arabic and some
108 Asian languages might want to override this."""
109
110 commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
111 """These punctuation marks are common in English and most languages that
112 use latin script."""
113
114 quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
115 """These are different quotation marks used by various languages."""
116
117 invertedpunc = u"¿¡"
118 """Inverted punctuation sometimes used at the beginning of sentences in
119 Spanish, Asturian, Galician, and Catalan."""
120
121 rtlpunc = u"،؟؛÷"
122 """These punctuation marks are used by Arabic and Persian, for example."""
123
124 CJKpunc = u"。、,;!?「」『』【】"
125 """These punctuation marks are used in certain circumstances with CJK
126 languages."""
127
128 indicpunc = u"।॥॰"
129 """These punctuation marks are used by several Indic languages."""
130
131 ethiopicpunc = u"።፤፣"
132 """These punctuation marks are used by several Ethiopic languages."""
133
134 miscpunc = u"…±°¹²³·©®×£¥€"
135 """The middle dot (·) is used by Greek and Georgian."""
136
137 punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,\
138 indicpunc, ethiopicpunc, miscpunc])
139 """We include many types of punctuation here, simply since this is only
140 meant to determine if something is punctuation. Hopefully we catch some
141 languages which might not be represented with modules. Most languages won't
142 need to override this."""
143
144 sentenceend = u".!?…։؟।。!?።"
145 """These marks can indicate a sentence end. Once again we try to account
146 for many languages. Most languages won't need to override this."""
147
148
149
150
151
152 sentencere = re.compile(r"""(?s) #make . also match newlines
153 .*? #anything, but match non-greedy
154 [%s] #the puntuation for sentence ending
155 \s+ #the spacing after the puntuation
156 (?=[^a-z\d])#lookahead that next part starts with caps
157 """ % sentenceend, re.VERBOSE)
158
159 puncdict = {}
160 """A dictionary of punctuation transformation rules that can be used by
161 punctranslate()."""
162
163 ignoretests = []
164 """List of pofilter tests for this language that must be ignored."""
165
166 checker = None
167 """A language specific checker (see filters.checks).
168
169 This doesn't need to be supplied, but will be used if it exists."""
170
171 _languages = {}
172
173 validaccel = None
174 """Characters that can be used as accelerators (access keys) i.e. Alt+X
175 where X is the accelerator. These can include combining diacritics as
176 long as they are accessible from the users keyboard in a single keystroke,
177 but normally they would be at least precomposed characters. All characters,
178 lower and upper, are included in the list."""
179
180 validdoublewords = []
181 """Some languages allow double words in certain cases. This is a list
182 of such words."""
183
205
207 memo[id(self)] = self
208 return self
209
211 """Give a simple string representation without address information to
212 be able to store it in text for comparison later."""
213 detail = ""
214 if self.code:
215 detail = "(%s)" % self.code
216 return "<class 'translate.lang.common.Common%s'>" % detail
217
242 punctranslate = classmethod(punctranslate)
243
def length_difference(cls, length):
    """Returns an estimate to a likely change in length relative to an
    English string of length length.

    The expansion factor is looked up in data.expansion_factors, first for
    this language's full code and then for each simpler code that
    data.simplercode() yields. If no non-zero factor is found, a default
    factor of 0.1 is used.

    @param length: The length of the English string
    @return: A constant (at least 5, otherwise 40 times the factor) plus the
    factor multiplied by length, as an integer
    @see: L{data}
    """
    expansion_factor = 0
    code = cls.code
    while code:
        # Look up the current (possibly simplified) code rather than
        # cls.code, otherwise the fallback to simpler codes can never
        # find anything the first lookup did not.
        expansion_factor = data.expansion_factors.get(code, 0)
        if expansion_factor:
            break
        code = data.simplercode(code)
    else:
        # The loop ended without a break: no code gave a usable factor.
        expansion_factor = 0.1
    constant = max(5, int(40 * expansion_factor))

    return constant + int(expansion_factor * length)
length_difference = classmethod(length_difference)
262
def alter_length(cls, text):
    """Converts the given string by adding or removing characters as an
    estimation of translation length (with English assumed as source
    language).

    The text is split on blank lines (two newlines) and each part is
    resized on its own. Parts of nine characters or fewer are kept as is.

    @param text: The string to resize
    @return: The resized parts joined again with blank lines
    """

    def resize(part):
        size = len(part)
        if size <= 9:
            return part
        delta = cls.length_difference(size)
        if delta > 0:
            # Grow: repeat the start of the part (without newlines) in front
            padding = part[:delta].replace(u'\n', u'')
            return padding + part
        # Shrink (or keep, when delta is 0): drop characters from the start
        return part[-delta:]

    return u"\n\n".join([resize(part) for part in text.split(u"\n\n")])
alter_length = classmethod(alter_length)
283
def character_iter(cls, text):
    """Returns an iterator over the characters in text.

    A whitespace character that directly follows another whitespace
    character is skipped, and characters found in cls.punctuation are not
    yielded.
    """
    previous = 'A'
    for char in text:
        if not (char.isspace() and previous.isspace()):
            # Punctuation is remembered as the previous character too,
            # even though it is not yielded.
            previous = char
            if char not in cls.punctuation:
                yield char
character_iter = classmethod(character_iter)
295
299 characters = classmethod(characters)
300
def word_iter(cls, text):
    """Returns an iterator over the words in text.

    The text is split on whitespace and the characters in cls.punctuation
    are stripped from both ends of every piece. Pieces that end up empty
    are left out.
    """
    for piece in text.split():
        stripped = piece.strip(cls.punctuation)
        if not stripped:
            continue
        yield stripped
word_iter = classmethod(word_iter)
309
def words(cls, text):
    """Returns a list of words in text.

    This is everything that cls.word_iter(text) yields, collected in order.
    """
    return list(cls.word_iter(text))
words = classmethod(words)
314
316 """Returns an iterator over the sentences in text."""
317 lastmatch = 0
318 text = text or ""
319 for item in cls.sentencere.finditer(text):
320 lastmatch = item.end()
321 sentence = item.group()
322 if strip:
323 sentence = sentence.strip()
324 if sentence:
325 yield sentence
326 remainder = text[lastmatch:]
327 if strip:
328 remainder = remainder.strip()
329 if remainder:
330 yield remainder
331 sentence_iter = classmethod(sentence_iter)
332
334 """Returns a list of sentences in text."""
335 return [s for s in cls.sentence_iter(text, strip=strip)]
336 sentences = classmethod(sentences)
337
def capsstart(cls, text):
    """Determines whether the text starts with a capital letter.

    Leading whitespace and then leading characters from cls.punctuation are
    ignored before the first character is examined.

    @param text: The string to examine
    @return: True if the first remaining character is uppercase, otherwise
    False (also when nothing remains after stripping)
    """
    stripped = text.lstrip().lstrip(cls.punctuation)
    # bool() so that empty input gives False instead of the empty string
    return bool(stripped) and stripped[0].isupper()
capsstart = classmethod(capsstart)
343