Package translate :: Package storage :: Module trados
[hide private]
[frames | no frames]

Source Code for Module translate.storage.trados

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2010 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Manage the Trados .txt Translation Memory format 
 22   
 23  A Trados file looks like this:: 
 24      <TrU> 
 25      <CrD>18012000, 13:18:35 
 26      <CrU>CAROL-ANN 
 27      <UsC>0 
 28      <Seg L=EN_GB>Association for Road Safety \endash  Conference 
 29      <Seg L=DE_DE>Tagung der Gesellschaft für Verkehrssicherheit 
 30      </TrU> 
 31      <TrU> 
 32      <CrD>18012000, 13:19:14 
 33      <CrU>CAROL-ANN 
 34      <UsC>0 
 35      <Seg L=EN_GB>Road Safety Education in our Schools 
 36      <Seg L=DE_DE>Verkehrserziehung an Schulen 
 37      </TrU> 
 38  """ 
 39   
 40  import re 
 41  import time 
 42   
 43  try: 
 44      # FIXME see if we can't use lxml 
 45      from BeautifulSoup import BeautifulStoneSoup 
 46  except ImportError: 
 47      raise ImportError("BeautifulSoup is not installed") 
 48   
 49  from translate.storage import base 
 50   
# strftime/strptime pattern: two-digit day, two-digit month and four-digit
# year run together, then ", " and a 24-hour hh:mm:ss time,
# e.g. "18012000, 13:18:35".
TRADOS_TIMEFORMAT = "%d%m%Y, %H:%M:%S"
"""Time format used by Trados .txt"""
 53   
# Maps each supported RTF control sequence (as it appears in Trados .txt
# segments) to the single Unicode character it stands for.  unescape() applies
# the mapping key -> value and escape() applies it value -> key.
RTF_ESCAPES = {
ur"\emdash": u"—",     # Em dash. \u2014
ur"\endash": u"–",     # En dash. \u2013
# Nonbreaking space equal to width of character "m" in current font.
ur"\emspace": u"\u2003",
# Nonbreaking space equal to width of character "n" in current font.
ur"\enspace": u"\u2002",
# Not mapped: no replacement character has been chosen for this control.
#ur"\qmspace": "",    # One-quarter em space.
ur"\bullet": u"•",     # Bullet character.
ur"\lquote": u"‘",     # Left single quotation mark. \u2018
ur"\rquote": u"’",     # Right single quotation mark. \u2019
ur"\ldblquote": u"“",  # Left double quotation mark. \u201C
ur"\rdblquote": u"”",  # Right double quotation mark. \u201D
ur"\~": u"\u00a0", # Nonbreaking space
ur"\-": u"\u00ad", # Optional hyphen.
ur"\_": u"‑", # Nonbreaking hyphen \U2011
# A hexadecimal value, based on the specified character set (may be used to
# identify 8-bit values).
# Not mapped: "hh" is a variable hex pair, which a fixed key cannot express.
#ur"\'hh": "",
}
"""RTF control to Unicode map
U{<http://msdn.microsoft.com/en-us/library/aa140283%28v=office.10%29.aspx#rtfspec_specialchar>}
"""
 77   
 78   
def unescape(text):
    """Replace the RTF control sequences in Trados text with Unicode.

    Every key of C{RTF_ESCAPES} that occurs in the text is substituted by the
    character it maps to.

    @param text: Text as stored in a Trados .txt segment
    @type text: Unicode
    @return: The text with all known control sequences replaced
    """
    unescaped = text
    for control, character in RTF_ESCAPES.items():
        unescaped = unescaped.replace(control, character)
    return unescaped
84 85
def escape(text):
    """Replace special Unicode characters with their Trados escapes.

    Every character that appears as a value in C{RTF_ESCAPES} is substituted
    by the RTF control sequence that maps to it.

    @param text: Plain Unicode text
    @type text: Unicode
    @return: The text with all mapped characters written as control sequences
    """
    escaped = text
    for control, character in RTF_ESCAPES.items():
        escaped = escaped.replace(character, control)
    return escaped
91 92
class TradosTxtDate(object):
    """Timestamp in the Trados .txt format of DDMMYYYY, hh:mm:ss

    The value is stored as a C{time.struct_time}.  It can be read and written
    either directly through the C{time} property or as a Trados formatted
    string through the C{timestring} property.
    """

    def __init__(self, newtime=None):
        """Create the timestamp, optionally initialised from C{newtime}.

        @param newtime: A Trados time string or a C{time.struct_time}; a
        false value or any other type leaves the timestamp unset
        """
        self._time = None
        if not newtime:
            return
        if isinstance(newtime, basestring):
            # Strings go through the parsing setter.
            self.timestring = newtime
        elif isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Return the time as a Trados formatted string, None when unset"""
        if self._time:
            return time.strftime(TRADOS_TIMEFORMAT, self._time)
        return None

    def set_timestring(self, timestring):
        """Parse a Trados formatted string and store the resulting time

        @param timestring: A Trados time string (DDMMYYYY, hh:mm:ss)
        @type timestring: String
        """
        parsed = time.strptime(timestring, TRADOS_TIMEFORMAT)
        self._time = parsed

    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Return the stored C{time.struct_time}, None when unset"""
        return self._time

    def set_time(self, newtime):
        """Store a new time; anything that is not a struct_time unsets it

        @param newtime: a new time object
        @type newtime: time.struct_time
        """
        is_struct = bool(newtime) and isinstance(newtime, time.struct_time)
        self._time = newtime if is_struct else None

    time = property(get_time, set_time)

    def __str__(self):
        """Return the Trados formatted string, or "" when no time is set"""
        return self.timestring or ""
141 142
class TradosUnit(base.TranslationUnit):
    """A translation unit that reads its text from parsed Trados markup.

    The markup is held in C{_soup}; C{source} is taken from the first <seg>
    element found in it and C{target} from the second.
    """
    # NOTE(review): source and target are defined without a setter, so they
    # are read-only here; confirm that base.TranslationUnit.__init__ does not
    # try to assign to them.

    def __init__(self, source=None):
        """Create a unit with no markup attached yet.

        @param source: Passed on unchanged to the base class initialiser
        """
        self._soup = None
        super(TradosUnit, self).__init__(source)

    def _segment_text(self, index):
        """Return the unescaped first content node of the index-th <seg>"""
        segments = self._soup.findAll('seg')
        raw_text = segments[index].contents[0]
        return unescape(raw_text)

    def getsource(self):
        """Return the unescaped text of the first <seg> element"""
        return self._segment_text(0)

    source = property(getsource)

    def gettarget(self):
        """Return the unescaped text of the second <seg> element"""
        return self._segment_text(1)

    target = property(gettarget)
156 157
def _close_unterminated_tag(match):
    """Rebuild a matched line as <fulltag>content</tag>, dropping the CR"""
    return '<%(fulltag)s>%(content)s</%(tag)s>' % match.groupdict()


class TradosSoup(BeautifulStoneSoup):
    """BeautifulStoneSoup variant with a massage rule for Trados .txt markup.

    The single rule matches an opening tag (with any attributes) followed by
    content on the same line and a carriage return, and rewrites it so that
    the content is wrapped in a matching closing tag.  Lines that do not end
    in a carriage return are not matched by the pattern.
    """
    # NOTE(review): presumably BeautifulSoup 3 applies these (pattern,
    # replacement) pairs to the markup before parsing -- confirm against the
    # BeautifulSoup documentation.
    MARKUP_MASSAGE = [
        (re.compile('<(?P<fulltag>(?P<tag>[^\s\/]+).*?)>(?P<content>.+)\r'),
         _close_unterminated_tag),
    ]
164 165
class TradosTxtTmFile(base.TranslationStore):
    """A Trados translation memory file"""
    Name = _("Trados Translation Memory")
    Mimetypes = ["application/x-trados-tm"]
    Extensions = ["txt"]

    def __init__(self, inputfile=None, unitclass=TradosUnit):
        """Construct a Trados TM, optionally reading in from inputfile.

        @param inputfile: A file-like object or a string holding the contents
        of a Trados .txt file; nothing is parsed when it is None
        @param unitclass: The class used to create the units of this store
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        # NOTE(review): the encoding is recorded here but is not handed to
        # TradosSoup anywhere in this class; confirm whether that is intended.
        self._encoding = 'iso-8859-1'
        # Defined up front so that str() works on a store that was never
        # parsed.
        self._soup = None
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """Parse Trados markup and add one unit per <TrU> element.

        @param input: A file-like object (it is read completely and then
        closed) or a string holding the markup
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            # Close the file even when reading fails.
            try:
                tmsrc = input.read()
            finally:
                input.close()
            input = tmsrc
        self._soup = TradosSoup(input)
        # The tag name is searched in lower case (see the FIXME in __str__).
        for tu in self._soup.findAll('tru'):
            # Honour the unit class this store was constructed with.
            unit = self.UnitClass()
            unit._soup = TradosSoup(str(tu))
            self.addunit(unit)

    def __str__(self):
        """Return the prettified markup, or "" when nothing was parsed"""
        if self._soup is None:
            return ""
        # FIXME turn the lowercased tags back into mixed case
        return self._soup.prettify()
199