Fri Feb 24 00:31:28 UTC 2006 Alberto Bertogli <albertogli@telpin.com.ar>
* Rewrite encoding handling.
This patch rewrites fixu8() and affects a couple of places where decode() was
used directly.
It has the benefit of removing the ugly previous function, replacing it
with... well, another ugly function, but with a different kind of ugliness.
It also supports multiple encodings, so if the first one fails, a second one
is tried (and so on).
There are still some corner cases with files named in alternative encodings,
but it should work much better than the older code.
{
hunk ./config.py.sample 83
+ # You can, optionally, specify multiple encodings; they're tried in
+ # order, and if one fails to decode a string, the next one is tried.
+ # Example: repoencoding = "utf8", "latin1"
hunk ./darcsweb.cgi 91
- openpos = s.find('[_')
- if openpos < 0:
- # small optimization to avoid the conversion to utf8 and
- # entering the loop
- if type(s) == unicode:
- # workaround for python < 2.4
- return s.encode('utf8')
- else:
- return s.decode(config.repoencoding).encode('utf8')
-
- s = s.encode(config.repoencoding).decode('raw_unicode_escape')
- while openpos >= 0:
- closepos = s.find('_]', openpos)
- if closepos < 0:
- # not closed, probably just luck
- break
-
- # middle should be something like 'c3', so we get it by
- # removing the first three characters ("[_\")
- middle = s[openpos + 3:closepos]
- if len(middle) == 2:
- # now we turn middle into the character "\xc3"
- char = chr(int(middle, 16))
+ """Calls _fixu8(), which does the real work, line by line. Otherwise
+ we choose the wrong encoding for big buffers and end up messing
+ output."""
+ n = []
+ for i in s.split('\n'):
+ n.append(_fixu8(i))
+ return string.join(n, '\n')
hunk ./darcsweb.cgi 99
- # finally, replace s with our new improved string, and
- # repeat the ugly procedure
- char = char.decode(config.repoencoding)
- mn = '[_\\' + middle + '_]'
- s = s.replace(mn, char, 1)
- openpos = s.find('[_', openpos + 1)
+def _fixu8(s):
+ """Return s as a utf8-encoded string.
+
+ Unicode objects are encoded directly. Anything else is decoded with
+ each encoding in config.repoencoding, in order, and the first one that
+ decodes without a UnicodeDecodeError is used. Encoding to utf8 uses
+ the 'replace' error handler.
+
+ Raises the string exception 'DecodingError', with config.repoencoding
+ as its value, when none of the encodings can decode s."""
+ if type(s) == unicode:
+ return s.encode('utf8', 'replace')
+ for e in config.repoencoding:
+ try:
+ return s.decode(e).encode('utf8', 'replace')
+ except UnicodeDecodeError:
+ # this encoding does not fit; fall through to the next one
+ pass
+ # NOTE(review): this is a string exception; later Python 2 releases
+ # deprecate them -- consider a proper exception class.
+ raise 'DecodingError', config.repoencoding
hunk ./darcsweb.cgi 109
- if config.repoencoding != 'utf8':
- s = s.encode('utf8')
- else:
- s = s.encode('raw_unicode_escape', 'replace')
- return s
hunk ./darcsweb.cgi 570
- cmd = config.darcspath + "darcs " + params
+ cmd = 'DARCS_DONT_ESCAPE_8BIT=1 ' + config.darcspath + "darcs " + params
hunk ./darcsweb.cgi 628
+class XmlInputWrapper:
+ """File-like wrapper around fd, handed to the XML parser.
+
+ The first read() call returns an XML declaration announcing utf-8
+ instead of reading from fd; later calls return the data read from fd
+ after passing it through fixu8()."""
+ def __init__(self, fd):
+ self.fd = fd
+ # number of read() calls made so far
+ self.times = 0
+ # NOTE(review): alias of read(); what consumes _read is not visible
+ # in this patch -- confirm it is still needed.
+ self._read = self.read
+
+ def read(self, *args, **kwargs):
+ self.times += 1
+ if self.times == 1:
+ # first call: return the declaration, ignoring any size argument
+ return '<?xml version="1.0" encoding="utf-8"?>\n'
+ s = self.fd.read(*args, **kwargs)
+ if not s:
+ # an empty result from fd is passed back unchanged
+ return s
+ return fixu8(s)
+
+ def close(self, *args, **kwargs):
+ # delegate to the wrapped object
+ return self.fd.close(*args, **kwargs)
+
hunk ./darcsweb.cgi 815
- parser.parse(xmlf)
+ parser.parse(XmlInputWrapper(xmlf))
hunk ./darcsweb.cgi 1012
- l = l.decode(config.repoencoding, 'replace').encode('utf-8')
+ l = fixu8(l)
hunk ./darcsweb.cgi 1038
- l = l.decode(config.repoencoding, 'replace').encode('utf-8')
+ l = fixu8(l)
hunk ./darcsweb.cgi 2188
- repoencoding = c.repoencoding
+
+ # repoencoding must be a tuple of encoding names. A single encoding
+ # given as a plain string is wrapped in a one-element tuple so that it
+ # is treated as one name rather than as a sequence of characters.
+ # (The check has to be isinstance(): "is str" compares the value with
+ # the type object itself and is never true for a string.)
+ if isinstance(c.repoencoding, basestring):
+ repoencoding = (c.repoencoding, )
+ else:
+ repoencoding = c.repoencoding
+
}