Package translate :: Package storage :: Module dtd
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.dtd

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile) 
 23  these are specific .dtd files for localisation used by mozilla 
 24   
 25  Specifications 
 26  ============== 
 27  The following information is provided by Mozilla:: 
 28   
 29  *  U{Specification<http://www.w3.org/TR/REC-xml/#sec-entexpand>} 
 30   
 31  There is a grammar for entity definitions, which isn't really precise, 
 32  as the spec says.  There's no formal specification for DTD files, it's 
 33  just "whatever makes this work" basically. The whole piece is clearly not 
 34  the strongest point of the xml spec 
 35   
 36  XML elements are allowed in entity values. A number of things that are 
 37  allowed will just break the resulting document, Mozilla forbids these 
 38  in their DTD parser. 
 39  """ 
 40   
 41  from translate.storage import base 
 42  from translate.misc import quote 
 43   
 44  import re 
 45  import warnings 
 46  try: 
 47      from lxml import etree 
 48      import StringIO 
 49  except ImportError: 
 50      etree = None 
 51   
 52  labelsuffixes = (".label", ".title") 
 53  """Label suffixes: entries with this suffix are able to be combined with accesskeys 
 54  found in entries ending with L{accesskeysuffixes}""" 
 55  accesskeysuffixes = (".accesskey", ".accessKey", ".akey") 
 56  """Accesskey Suffixes: entries with this suffix may be combined with labels 
 57  ending in L{labelsuffixes} into accelerator notation""" 
 58   
 59   
def quotefordtd(source):
    """Quote a string so it can be used as the value of a DTD entity.

    The quoting style depends on which quote characters the value contains:
    no double quote uses C{quote.quotestr}, double quotes but no single
    quotes uses C{quote.singlequotestr}, and a value containing both is
    wrapped in single quotes with every embedded single quote replaced by
    C{&apos;}.

    @type source: String
    @param source: Unquoted entity value
    @rtype: String
    @return: Quoted entity value
    """
    if '"' not in source:
        return quote.quotestr(source)
    if "'" not in source:
        return quote.singlequotestr(source)
    # both kinds of quote are present: single-quote the value and escape
    # the single quotes inside it
    escaped = source.replace("'", '&apos;')
    return "'" + escaped + "'"
68 69
def unquotefromdtd(source):
    """Strip the quoting from a quoted DTD entity definition.

    The first character of the definition is taken as the quote character.
    An empty definition is treated as an empty double-quoted string.  For
    single-quoted definitions C{&apos;} is turned back into a single quote.

    Quote characters appearing inside the string are not handled specially
    here; the quote characters are expected to be the first and last
    characters of the definition.

    @type source: String
    @param source: Quoted entity definition
    @rtype: String
    @return: Entity value without the surrounding quotes
    """
    quoted = source
    if len(quoted) == 0:
        quoted = '""'
    delimiter = quoted[0]
    # the second element of the result is not needed here
    text, _finished = quote.extractwithoutquotes(quoted, delimiter, delimiter, allowreentry=False)
    if delimiter == "'":
        text = text.replace("&apos;", "'")
    return text
82 83
def removeinvalidamps(name, value):
    """Find and remove ampersands that are not part of an entity definition.

    A stray & in a DTD file can break an application's ability to parse the
    file.  In Mozilla localisation this is very important and these can break
    the parsing of files used in XUL and thus break interface rendering.
    Tracking down the problem is very difficult, thus by removing potential
    broken & and warning the users we can ensure that the output DTD will
    always be parsable.

    An ampersand is kept only when it is followed, up to the next C{;}, by a
    valid entity name: either alphanumerics (dots allowed), or C{#} followed
    by alphanumerics.  A warning is issued if anything is removed.

    @type name: String
    @param name: Entity name
    @type value: String
    @param value: Entity text value
    @rtype: String
    @return: Entity value without bad ampersands
    """

    def is_valid_entity_name(name):
        """Check that supplied L{name} is a valid entity name"""
        if name.replace('.', '').isalnum():
            return True
        # startswith() instead of name[0]: the candidate is empty for "&;"
        # and indexing it would raise IndexError
        if name.startswith('#') and name[1:].isalnum():
            return True
        return False

    invalid_amps = []
    searchfrom = 0
    while True:
        amppos = value.find("&", searchfrom)
        if amppos == -1:
            break
        # resume just after this ampersand, so an ampersand inside a
        # rejected candidate is still examined on its own
        searchfrom = amppos + 1
        semipos = value.find(";", searchfrom)
        if semipos != -1 and is_valid_entity_name(value[searchfrom:semipos]):
            continue
        invalid_amps.append(amppos)

    if invalid_amps:
        warnings.warn("invalid ampersands in dtd entity %s" % (name))
        # rebuild the value from the pieces between the bad ampersands
        pieces = []
        piecestart = 0
        for amppos in invalid_amps:
            pieces.append(value[piecestart:amppos])
            piecestart = amppos + 1
        pieces.append(value[piecestart:])
        value = "".join(pieces)
    return value
class dtdunit(base.TranslationUnit):
    """this class represents an entity definition from a dtd file (and possibly associated comments)

    The quoted entity value is kept in C{self.definition}.  Both C{source}
    and C{target} are properties over that single attribute, so setting
    either one changes the other.
    """

    def __init__(self, source=""):
        """construct the dtdunit, prepare it for parsing

        @type source: String
        @param source: Unquoted initial value for the unit
        """
        super(dtdunit, self).__init__(source)
        # (commenttype, text) pairs, in the order they were read
        self.comments = []
        # lines seen outside of any comment or entity; written back verbatim
        self.unparsedlines = []
        # parser state, carried from one line to the next by parse()
        self.incomment = False
        self.inentity = False
        # a non-None placeholder, so that isnull() is False for a fresh unit
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        # goes through the source property: stores the quoted definition
        self.source = source
        # whitespace and closing text used when writing the entity back out
        self.space_pre_entity = ' '
        self.space_pre_definition = ' '
        self.closing = ">"

    # Note that source and target are equivalent for monolingual units
    def setsource(self, source):
        """Sets the definition to the quoted value of source"""
        self.definition = quotefordtd(source)
        self._rich_source = None

    def getsource(self):
        """gets the unquoted source string"""
        return unquotefromdtd(self.definition)
    source = property(getsource, setsource)

    def settarget(self, target):
        """Sets the definition to the quoted value of target (None is treated as an empty string)"""
        if target is None:
            target = ""
        self.definition = quotefordtd(target)
        self._rich_target = None

    def gettarget(self):
        """gets the unquoted target string"""
        return unquotefromdtd(self.definition)
    target = property(gettarget, settarget)

    def isnull(self):
        """returns whether this dtdunit doesn't actually have an entity definition"""
        # for dtds, we currently return a blank string if there is no .entity (==location in other files)
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def parse(self, dtdsrc):
        """read the first dtd element from the source code into this object, return linesprocessed

        The source is walked line by line.  Comments are collected in
        C{self.comments}, lines outside any comment or entity go to
        C{self.unparsedlines}, and the first C{<!ENTITY} declaration fills
        C{self.entity} and C{self.definition}.  Processing stops as soon as
        the quoted value of that entity is complete.

        C{self.incomment}, C{self.inentity} and C{self.unparsedlines} are not
        reset here; they keep the values they had on entry.

        @type dtdsrc: String
        @param dtdsrc: DTD source text, possibly several lines
        @rtype: Integer
        @return: Number of lines consumed from dtdsrc (0 if dtdsrc is empty)
        @raise ValueError: If the entity value does not start with a single
        or double quote
        """
        self.comments = []
        # make all the lists the same: every kind of comment ends up in
        # self.comments, in the order it was read
        self.locfilenotes = self.comments
        self.locgroupstarts = self.comments
        self.locgroupends = self.comments
        self.locnotes = self.comments
        # self.locfilenotes = []
        # self.locgroupstarts = []
        # self.locgroupends = []
        # self.locnotes = []
        # self.comments = []
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            # print "line(%d,%d): " % (self.incomment,self.inentity),line[:-1]
            if not self.incomment:
                if (line.find('<!--') != -1):
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        # the word following 'LOCALIZATION NOTE' (after any spaces) selects the note type
                        l = quote.findend(comment, 'LOCALIZATION NOTE')
                        while (comment[l] == ' '):
                            l += 1
                        if comment.find('FILE', l) == l:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', l) == l:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', l) == l:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                #FIXME: bloody entity might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # a line containing '%...;' outside of an entity (presumably a parameter
                    # entity reference - TODO confirm) is kept verbatim as a plain comment
                    self.comments.append(("comment", line))
                    line = ""
                    continue

            if self.incomment:
                # some kind of comment
                # NOTE(review): quote.extract appears to return (extracted text, still-inside flag) - confirm in translate.misc.quote
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                # print "comment(%d,%d): " % (self.incomment,self.continuecomment),comment
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # check if there's actually an entity definition that's commented out
                # TODO: parse these, store as obsolete messages
                # if comment.find('<!ENTITY') != -1:
                #     # remove the entity from the comment
                #     comment, dummy = quote.extractwithoutquotes(comment, ">", "<!ENTITY", None, 1)
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self.locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self.locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self.locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self.locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = True
                    # text before the declaration is remembered only if it starts with '#'
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                # self.entitypart moves through "start" -> "name" -> ("parameter" ->) "definition";
                # a declaration may be spread over several lines, so each stage can resume on a later line
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line, '<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    s = 0
                    e = 0
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    # the amount of whitespace is recorded as that many spaces, for getoutput()
                    self.space_pre_entity = ' ' * (e - s)
                    s = e
                    self.entity = ''
                    # a leading '%' marks what this class calls an "external" entity
                    if (e < len(line) and line[e] == '%'):
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                    while (e < len(line) and not line[e].isspace()):
                        self.entity += line[e]
                        e += 1
                    s = e
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.space_pre_definition = ' ' * (e - s)
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # nothing follows the name on this line: carry on with the next line
                            self.entityhelp = None
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    paramstart = e
                    while (e < len(line) and line[e].isalnum()):
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    # a quote character right after the parameter starts the definition
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        # the definition starts on a later line than the name: skip leading whitespace
                        e = 0
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # actually the lines below should remember instring, rather than using it as dummy
                    # self.entityhelp is (start position in this line, quote character)
                    e = self.entityhelp[0]
                    if (self.entityhelp[1] == "'"):
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif (self.entityhelp[1] == '"'):
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # the quoted value is complete: keep whatever follows it as the closing text and stop
                        self.closing = line[e+len(defpart):].rstrip("\n\r")
                        self.inentity = False
                        break

        # debugging aid: change the condition below to a true value to dump
        # every attribute of the unit into the comments
        if 0:
            for attr in dir(self):
                r = repr(getattr(self, attr))
                if len(r) > 60:
                    r = r[:57] + "..."
                self.comments.append(("comment", "self.%s = %s" % (attr, r)))
        return linesprocessed

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here

        Unicode output is encoded with C{self.encoding} if the unit has that
        attribute, otherwise with UTF-8.
        """
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the dtd entity back to string form

        The output is the comments, then the unparsed lines, then the entity
        declaration.  For a unit without an entity only the comments and
        unparsed lines are returned, with trailing whitespace replaced by a
        single newline.

        @rtype: String
        @return: DTD source text for this unit
        """
        lines = []
        lines.extend([comment for commenttype, comment in self.comments])
        lines.extend(self.unparsedlines)
        if self.isnull():
            result = "".join(lines)
            return result.rstrip() + "\n"
        # for f in self.locfilenotes: yield f
        # for ge in self.locgroupends: yield ge
        # for gs in self.locgroupstarts: yield gs
        # for n in self.locnotes: yield n
        # an empty entity name produces no declaration line at all
        if len(self.entity) > 0:
            if getattr(self, 'entitytype', None) == 'external':
                entityline = '<!ENTITY % ' + self.entity + ' ' + self.entityparameter + ' ' + self.definition + self.closing
            else:
                entityline = '<!ENTITY' + self.space_pre_entity + self.entity + self.space_pre_definition + self.definition + self.closing
            # put back the '#...' text that preceded the declaration, if parse() recorded any
            if getattr(self, 'hashprefix', None):
                entityline = self.hashprefix + " " + entityline
            if isinstance(entityline, unicode):
                entityline = entityline.encode('UTF-8')
            lines.append(entityline + '\n')
        return "".join(lines)
392 393
class dtdfile(base.TranslationStore):
    """this class represents a .dtd file, made up of dtdunits"""
    UnitClass = dtdunit

    def __init__(self, inputfile=None):
        """construct a dtdfile, optionally reading in from inputfile

        @param inputfile: Object with a C{read()} method returning the DTD
        source; its C{name} attribute, if any, is kept as C{self.filename}
        """
        base.TranslationStore.__init__(self, unitclass=self.UnitClass)
        self.filename = getattr(inputfile, 'name', '')
        if inputfile is not None:
            dtdsrc = inputfile.read()
            self.parse(dtdsrc)
            self.makeindex()

    def parse(self, dtdsrc):
        """read the source code of a dtd file in and include them as dtdunits in self.units

        The source is cut into chunks of lines.  A chunk ends after the first
        line that starts with a quote character followed by C{>}, once a line
        containing C{<!ENTITY} has been seen in the chunk; otherwise it runs
        to the end of the source.  Each chunk is then parsed into as many
        dtdunits as it takes to consume it.  A unit that fails to parse
        produces a warning instead of aborting the whole file.

        @type dtdsrc: String
        @param dtdsrc: Complete DTD source text
        """
        start = 0
        end = 0
        lines = dtdsrc.split("\n")
        while end < len(lines):
            if start == end:
                end += 1
            foundentity = False
            while end < len(lines):
                if lines[end].find('<!ENTITY') > -1:
                    foundentity = True
                # raw string: "\s" is not a valid escape in a normal string literal
                if foundentity and re.match(r'''["']\s*>''', lines[end]):
                    end += 1
                    break
                end += 1
            # print "processing from %d to %d" % (start,end)

            linesprocessed = 1  # to initialise loop
            while linesprocessed >= 1:
                newdtd = dtdunit()
                try:
                    linesprocessed = newdtd.parse("\n".join(lines[start:end]))
                    if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines):
                        self.units.append(newdtd)
                # "as" form (Python 2.6+): the comma form is Python 2 only
                except Exception as e:
                    warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end])))
                    # skip exactly one line and retry; reusing the count left over
                    # from the previous unit could move start past end and
                    # silently drop lines
                    linesprocessed = 1
                start += linesprocessed

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here

        Returns None, after a warning, when the store does not validate.
        Note that the built-in C{str()} raises TypeError when C{__str__}
        returns None, so that result is only visible to code calling
        C{__str__()} directly.
        """
        source = self.getoutput()
        if not self._valid_store():
            warnings.warn("DTD file '%s' does not validate" % self.filename)
            return None
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the units back to source"""
        sources = [str(dtd) for dtd in self.units]
        return "".join(sources)

    def makeindex(self):
        """makes self.index dictionary keyed on entities"""
        self.index = {}
        for dtd in self.units:
            if not dtd.isnull():
                self.index[dtd.entity] = dtd

    def _valid_store(self):
        """Validate the store to determine if it is valid

        This uses lxml's etree to parse the DTD.  If lxml could not be
        imported no validation is done and the store is reported as valid.

        @return: If the store passes validation
        @rtype: Boolean
        """
        if etree is not None:
            try:
                # #expand is a Mozilla hack and are removed as they are not valid in DTDs
                etree.DTD(StringIO.StringIO(self.getoutput().replace("#expand", "")))
            except etree.DTDParseError:
                return False
        return True
475