Package translate :: Package misc :: Module quote
[hide private]
[frames] | no frames]

Source Code for Module translate.misc.quote

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """String processing utilities for extracting strings with various kinds 
 23  of delimiters""" 
 24   
 25  import logging 
 26  import htmlentitydefs 
 27   
 28  from translate.misc.typecheck import accepts, returns 
def find_all(searchin, substr):
    """Returns a list of locations where substr occurs in searchin.

    Locations are not allowed to overlap: the scan runs left to right and
    resumes just after the end of each match, so searching "aaaa" for "aa"
    gives [0, 2] rather than [0, 1, 2].

    An empty substr gives an empty list.  str.find() reports an empty match
    at the offset it was asked to start from, so the search position would
    never advance and the loop would never finish.
    """
    step = len(substr)
    if not step:
        return []
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        # Continue after the match so that matches cannot overlap
        location = searchin.find(substr, location + step)
    return locations
42
def extract(source, startdelim, enddelim,
            escape=None, startinstring=False, allowreentry=True):
    """Extracts the delimited sections of source, delimiters included.

    source        -- the text to scan
    startdelim    -- the string that opens a section
    enddelim      -- the string that closes a section (may equal startdelim)
    escape        -- optional escape string; a delimiter directly preceded by
                     an escape that is not itself escaped is ignored
    startinstring -- True if source begins inside a section
    allowreentry  -- if False, no section is entered once one has been entered

    Returns a tuple of (extracted text with its delimiters, still inside a
    section at the end of source).
    """
    # Note that this returns the quote characters as well... even internally
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        escape_lookup = set(escape_places)
        # Filter escaped escapes: an escape directly preceded by another
        # escape toggles, any other escape is always active
        true_escape = False
        true_escapes = set()
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escapes.add(escape_pos)
        startdelim_places = [pos for pos in startdelim_places
                             if pos - lenescape not in true_escapes]
        # End places are stored as the offset just past the delimiter
        enddelim_places = [pos + lenend for pos in enddelim_places
                           if pos - lenescape not in true_escapes]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # Get a sorted list of the significant places in the string.  Duplicates
    # are deliberately kept: visiting a position twice can close and re-open
    # a section, which matters when allowreentry is False.
    significant_places = [0] + startdelim_places + enddelim_places + [len(source) - 1]
    significant_places.sort()
    # Sets make the membership tests in the loop below constant time; the
    # lists they are built from are only needed for ordering.
    start_lookup = set(startdelim_places)
    end_lookup = set(enddelim_places)
    pieces = []
    lastpos = None
    for pos in significant_places:
        if instring and pos in end_lookup:
            # Make sure that if startdelim == enddelim we don't get confused
            # and count the same string as start and end.
            if lastpos == pos - lenstart and lastpos in start_lookup:
                continue
            # lastpos is None when we started inside a section, which slices
            # from the beginning of source
            pieces.append(source[lastpos:pos])
            instring = False
            lastpos = pos
        if (not instring) and pos in start_lookup and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: take everything up to the end of source
        pieces.append(source[lastpos:])
    return ("".join(pieces), instring)
98
def extractwithoutquotes(source, startdelim, enddelim, escape=None,
                         startinstring=False, includeescapes=True,
                         allowreentry=True):
    """Extracts the text found between delimiters, leaving the delimiters out.

    source         -- the text to scan
    startdelim     -- the string that opens a section
    enddelim       -- the string that closes a section (may equal startdelim)
    escape         -- optional escape string; a delimiter directly preceded
                      by an escape that is not itself escaped is ignored
    startinstring  -- True if source begins inside a section
    includeescapes -- a true value leaves escape sequences in the result, a
                      false value removes the escape string but keeps the
                      character after it.  It can also be a function that
                      takes the escape string plus the following character
                      and returns the replacement text (a non-string return
                      value is treated as the old-style boolean: true keeps
                      both, false keeps only the following character).
    allowreentry   -- if False, no section is entered once one has been
                      entered

    Returns a tuple of (extracted text, still inside a section at the end of
    source).
    """
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    #hell slow because it is called far too often
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        # filter escaped escapes: an escape directly preceded by another
        # escape toggles, any other escape is always active
        true_escape = False
        true_escape_places = []
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_places:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.append(escape_pos)
        # drop delimiters that directly follow an active escape; end places
        # are stored as the offset just past the delimiter
        startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
        enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # get a sorted list of the significant places in the string (the list is
    # sorted but duplicates are not removed)
    significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
    significant_places.sort()
    extracted = ""
    # NOTE(review): lastpos starts at 0 and sections are sliced from
    # lastpos + len(startdelim), so with startinstring=True the first
    # len(startdelim) characters of source are skipped even though no
    # delimiter was seen there -- confirm callers rely on this.
    lastpos = 0
    callable_includeescapes = callable(includeescapes)
    # escapes only need rewriting if a callback decides, or if they must go
    checkescapes = callable_includeescapes or not includeescapes
    for pos in significant_places:
        # the lastpos test skips an end place that is merely the end of the
        # delimiter we entered on
        if instring and pos in enddelim_places and lastpos != pos - lenstart:
            # strip the delimiters from both ends of the section
            section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)
            section = source[section_start:section_end]
            if escape is not None and checkescapes:
                # active escapes inside this section, as offsets relative to
                # the start of the section
                escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
                new_section = ""
                last_epos = 0
                for epos in escape_list:
                    new_section += section[last_epos:epos]
                    if callable_includeescapes:
                        # hand over the escape string plus one character
                        replace_escape = includeescapes(section[epos:epos + lenescape + 1])
                        # TODO: deprecate old method of returning boolean from
                        # includeescape, by removing this if block
                        if not isinstance(replace_escape, basestring):
                            if replace_escape:
                                replace_escape = section[epos:epos + lenescape + 1]
                            else:
                                replace_escape = section[epos + lenescape:epos + lenescape + 1]
                        new_section += replace_escape
                        last_epos = epos + lenescape + 1
                    else:
                        # includeescapes is false: skip the escape string only
                        last_epos = epos + lenescape
                section = new_section + section[last_epos:]
            extracted += section
            instring = False
            lastpos = pos
        if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # unterminated section: take everything after the opening delimiter
        section_start = lastpos + len(startdelim)
        section = source[section_start:]
        # NOTE(review): unlike the loop above this tests "not includeescapes"
        # rather than checkescapes.  A callable is a true value, so the
        # callable_includeescapes test below looks unreachable and an
        # unterminated section is not passed through a callable
        # includeescapes -- confirm whether that is intended.
        if escape is not None and not includeescapes:
            escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
            new_section = ""
            last_epos = 0
            for epos in escape_list:
                new_section += section[last_epos:epos]
                if callable_includeescapes and includeescapes(section[epos:epos + lenescape + 1]):
                    last_epos = epos
                else:
                    last_epos = epos + lenescape
            section = new_section + section[last_epos:]
        extracted += section
    return (extracted, instring)
188
def escapequotes(source, escapeescapes=0):
    """Returns source with every double quote preceded by a backslash.

    If escapeescapes is true, existing backslashes are doubled first so that
    they survive unquoting.
    """
    prepared = source.replace('\\', '\\\\') if escapeescapes else source
    return prepared.replace('"', '\\"')
196
def escapesinglequotes(source):
    """Returns source with every single quote written twice."""
    # Splitting on the quote and joining with two quotes doubles each one
    fragments = source.split("'")
    return "''".join(fragments)
201
@accepts(unicode)
@returns(unicode)
def htmlentityencode(source):
    """Encodes source using HTML entities, e.g. the copyright sign
    becomes &copy;

    Every character that has a name in htmlentitydefs.codepoint2name is
    replaced by "&name;".  All other characters are copied unchanged.
    """
    codepoint2name = htmlentitydefs.codepoint2name
    pieces = []
    for char in source:
        name = codepoint2name.get(ord(char))
        if name is None:
            # Keep the unicode character as it is.  Passing it through str()
            # would encode it as ASCII and raise UnicodeEncodeError for any
            # non-ASCII character that has no named entity.
            pieces.append(char)
        else:
            pieces.append(u"&%s;" % name)
    return u"".join(pieces)
215
@accepts(unicode)
@returns(unicode)
def htmlentitydecode(source):
    """Decodes the named HTML entities in source, e.g. &copy; becomes the
    copyright sign.

    Only names listed in htmlentitydefs.name2codepoint are decoded.  Anything
    else that starts with "&" is copied to the output unchanged, including an
    "&" sequence that is cut short by a space, by another "&" or by the end
    of source.
    """
    name2codepoint = htmlentitydefs.name2codepoint
    pieces = []
    inentity = False
    possibleentity = ""
    for char in source:
        if char == "&":
            if inentity:
                # The previous "&" never reached its ";": it was not an
                # entity, so keep its text instead of discarding it
                pieces.append("&" + possibleentity)
            inentity = True
            possibleentity = ""
            continue
        if not inentity:
            pieces.append(char)
        elif char == ";":
            # An empty name is never in name2codepoint, so "&;" is copied
            if possibleentity in name2codepoint:
                pieces.append(unichr(name2codepoint[possibleentity]))
            else:
                pieces.append("&" + possibleentity + ";")
            inentity = False
        elif char == " ":
            # Entity names cannot contain spaces: this was a literal "&"
            pieces.append("&" + possibleentity + char)
            inentity = False
        else:
            possibleentity += char
    if inentity:
        # source ended in the middle of a possible entity: keep its text
        pieces.append("&" + possibleentity)
    return u"".join(pieces)
244
@accepts(unicode)
@returns(unicode)
def javapropertiesencode(source):
    """Encodes source in the escaped-unicode encoding used by Java
    .properties files.

    A leading space is protected with a backslash, characters listed in
    controlchars are replaced by their escapes, other characters below 128
    are copied, and everything else is written as an uppercase \\uXXXX escape.
    """
    pieces = []
    if source[:1] == u" ":
        pieces.append(u"\\")
    for char in source:
        if char in controlchars:
            pieces.append(controlchars[char])
            continue
        codepoint = ord(char)
        if 0 <= codepoint < 128:
            pieces.append(str(char))
        else:
            pieces.append(u"\\u%04X" % codepoint)
    return u"".join(pieces)
264
@accepts(unicode)
@returns(unicode)
def mozillapropertiesencode(source):
    """Encodes source in the escaped-unicode encoding used by Mozilla
    .properties files.

    Only the characters listed in controlchars are escaped; every other
    character, ASCII or not, is copied as it is.
    """
    return u"".join([controlchars.get(char, char) for char in source])
# Used by propertiesdecode: maps the character that follows a backslash to
# the character the escape stands for.
propertyescapes = {
    # escapes that are self-escaping
    "\\": "\\", "'": "'", '"': '"',
    # control characters that we keep
    "f": "\f", "n": "\n", "r": "\r", "t": "\t",
}

# Used by the encoders and by escapecontrols: maps a raw character to its
# backslash escape.  The quote characters have no entry here, so they are
# never escaped on output.
controlchars = {
    # the reverse of the above...
    "\\": "\\\\",
    "\f": "\\f", "\n": "\\n", "\r": "\\r", "\t": "\\t",
}
def escapecontrols(source):
    """Escapes the characters listed in controlchars in the given string.

    The string is rewritten in a single pass, one character at a time.
    Applying the replacements one after the other gives a result that
    depends on dictionary order: if the backslash entry is not handled
    first, the backslashes added for the other escapes are doubled again
    (a tab would become \\\\t instead of \\t).

    Returns a string of the same type as source.
    """
    # source[:0] is an empty string of the same type as source, so the
    # result is unicode for unicode input, even when source is empty
    empty = source[:0]
    return empty.join([controlchars.get(char, char) for char in source])
299
@accepts(unicode)
@returns(unicode)
def propertiesdecode(source):
    """Decodes source from the escaped-unicode encoding used by .properties
    files.

    Java uses Latin1 by default, and Mozilla uses UTF-8 by default.

    Since the .decode("unicode-escape") routine decodes everything, and we
    don't want to, we reimplemented the algorithm from Python
    Objects/unicode.c in Python and modified it to retain escaped control
    characters.

    Handling of a backslash followed by:
      - nothing (end of source): the backslash is kept, as it marks a line
        continuation for the parser
      - a newline: both are dropped
      - a key of propertyescapes: replaced by the mapped character
      - u or U: up to four ASCII hex digits are read and decoded; if no hex
        digit follows, the escape is kept as it is
      - N{name}: replaced by the named character; a \\N without "{" or
        without "}" is logged and kept as it is
      - anything else: the backslash is dropped

    Raises KeyError if a \\N{name} escape names an unknown character.
    """
    output = u""
    s = 0
    hexdigits = "0123456789abcdef"

    def unichr2(i):
        """Returns a Unicode string of one character with ordinal 32 <= i,
        otherwise an escaped control character.
        """
        if 32 <= i:
            return unichr(i)
        elif unichr(i) in controlchars:
            # we just return the character, unescaped
            # if people want to escape them they can use escapecontrols
            return unichr(i)
        else:
            return "\\u%04x" % i

    while s < len(source):
        c = source[s]
        if c != '\\':
            output += c
            s += 1
            continue
        s += 1
        if s >= len(source):
            # this is an escape at the end of the line, which implies
            # a continuation..., return the escape to inform the parser
            output += c
            continue
        c = source[s]
        s += 1
        if c == '\n':
            pass
        # propertyescapes lookups
        elif c in propertyescapes:
            output += propertyescapes[c]
        # \uXXXX escapes
        # \UXXXX escapes
        elif c in "uU":
            x = 0
            digits = 0
            # Read at most four hex digits.  The value is only shifted when
            # a digit is really consumed, and only consumed digits are
            # skipped, so a short escape such as \u41 decodes to 0x41 and
            # the text after it is left alone.
            while digits < 4 and s + digits < len(source):
                # find() doubles as the digit value and only accepts ASCII
                # hex digits (isdigit() is also true for non-ASCII digits)
                value = hexdigits.find(source[s + digits].lower())
                if value == -1:
                    break
                x = (x << 4) + value
                digits += 1
            if digits:
                s += digits
                output += unichr2(x)
            else:
                # Not a unicode escape after all: keep it as it was written
                output += "\\" + c
        elif c == "N":
            # s may already be past the end when source ends with \N
            if s >= len(source) or source[s] != "{":
                logging.warning("Invalid named unicode escape: no { after \\N")
                output += "\\" + c
                continue
            s += 1
            e = source.find("}", s)
            if e == -1:
                logging.warning("Invalid named unicode escape: no } after \\N{")
                output += "\\" + c
                continue
            import unicodedata
            name = source[s:e]
            output += unicodedata.lookup(name)
            s = e + 1
        else:
            output += c  # Drop any \ that we don't specifically handle
    return output
def quotestr(source, escapeescapes=0):
    """Returns a doublequote-delimited quoted string, escaping double
    quotes with backslash.

    If source is a list, each item is quoted on its own and the quoted items
    are joined with newlines; an empty list gives an empty string.
    escapeescapes is passed on to escapequotes.
    """
    if isinstance(source, list):
        # join() also copes with an empty list, for which there is no first
        # line to start the result from
        return '\n'.join(['"' + escapequotes(line, escapeescapes) + '"'
                          for line in source])
    return '"' + escapequotes(source, escapeescapes) + '"'
403
def singlequotestr(source):
    """Returns a singlequote-delimited quoted string, escaping single quotes
    by doubling them.
    """
    quote = "'"
    return quote + escapesinglequotes(source) + quote
410
def findend(string, substring):
    """Returns the offset just past the first occurrence of substring in
    string, or -1 if substring does not occur.
    """
    start = string.find(substring)
    if start == -1:
        return -1
    return start + len(substring)
417
def rstripeol(string):
    """Returns string without its trailing carriage returns and newlines."""
    eol_chars = "\r\n"
    return string.rstrip(eol_chars)
421
def stripcomment(comment, startstring="<!--", endstring="-->"):
    """Returns the text inside a comment, without the comment markers and
    without surrounding whitespace.

    If startstring is missing the text is taken from the beginning of
    comment, and if endstring is missing it is taken up to the end.
    """
    cstart = comment.find(startstring)
    if cstart == -1:
        cstart = 0
    else:
        cstart += len(startstring)
    cend = comment.find(endstring, cstart)
    if cend == -1:
        # Without this, -1 would be used as a slice bound and silently cut
        # off the last character of the comment
        cend = len(comment)
    return comment[cstart:cend].strip()
431
def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
    """Returns comment, stripped of surrounding whitespace, wrapped in
    startstring and endstring.
    """
    body = comment.strip()
    return "".join([startstring, body, endstring])
435