darcs - darcsweb

repos / darcsweb / darcs_commitdiff

Rewrite encoding handling.

Fri Feb 24 00:31:28 UTC 2006 Alberto Bertogli <albertogli@telpin.com.ar>

* Rewrite encoding handling.

This patch rewrites fixu8() and affects a couple of places where decode() was

used directly.

It has the benefit of removing the ugly previous function, replacing it

with... well, another ugly function, but with a different kind of ugliness.

It also supports multiple encodings, so if the first one fails, a second one

is tried (and so on).

There are still some corner cases with files named in alternative encodings,

but it should work much better than the older code.

{

hunk ./config.py.sample 83

+ # You can, optionally, specify multiple encodings; they're tried in

+ # order, and if one fails to decode a string, the next one is tried.

+ # Example: repoencoding = "utf8", "latin1"

hunk ./darcsweb.cgi 91

- openpos = s.find('[_')

- if openpos < 0:

- # small optimization to avoid the conversion to utf8 and

- # entering the loop

- if type(s) == unicode:

- # workaround for python < 2.4

- return s.encode('utf8')

- else:

- return s.decode(config.repoencoding).encode('utf8')

- s = s.encode(config.repoencoding).decode('raw_unicode_escape')

- while openpos >= 0:

- closepos = s.find('_]', openpos)

- if closepos < 0:

- # not closed, probably just luck

- break

- # middle should be something like 'c3', so we get it by

- # removing the first three characters ("[_\")

- middle = s[openpos + 3:closepos]

- if len(middle) == 2:

- # now we turn middle into the character "\xc3"

- char = chr(int(middle, 16))

+ """Calls _fixu8(), which does the real work, line by line. Otherwise

+ we choose the wrong encoding for big buffers and end up messing

+ output."""

+ n = []

+ for i in s.split('\n'):

+ n.append(_fixu8(i))

+ return string.join(n, '\n')

hunk ./darcsweb.cgi 99

- # finally, replace s with our new improved string, and

- # repeat the ugly procedure

- char = char.decode(config.repoencoding)

- mn = '[_\\' + middle + '_]'

- s = s.replace(mn, char, 1)

- openpos = s.find('[_', openpos + 1)

+def _fixu8(s):

+ if type(s) == unicode:

+ return s.encode('utf8', 'replace')

+ for e in config.repoencoding:

+ try:

+ return s.decode(e).encode('utf8', 'replace')

+ except UnicodeDecodeError:

+ pass

+ raise 'DecodingError', config.repoencoding

hunk ./darcsweb.cgi 109

- if config.repoencoding != 'utf8':

- s = s.encode('utf8')

- else:

- s = s.encode('raw_unicode_escape', 'replace')

- return s

hunk ./darcsweb.cgi 570

- cmd = config.darcspath + "darcs " + params

+ cmd = 'DARCS_DONT_ESCAPE_8BIT=1 ' + config.darcspath + "darcs " + params

hunk ./darcsweb.cgi 628

+class XmlInputWrapper:

+ def __init__(self, fd):

+ self.fd = fd

+ self.times = 0

+ self._read = self.read

+ def read(self, *args, **kwargs):

+ self.times += 1

+ if self.times == 1:

+ return '<?xml version="1.0" encoding="utf-8"?>\n'

+ s = self.fd.read(*args, **kwargs)

+ if not s:

+ return s

+ return fixu8(s)

+ def close(self, *args, **kwargs):

+ return self.fd.close(*args, **kwargs)

hunk ./darcsweb.cgi 815

- parser.parse(xmlf)

+ parser.parse(XmlInputWrapper(xmlf))

hunk ./darcsweb.cgi 1012

- l = l.decode(config.repoencoding, 'replace').encode('utf-8')

+ l = fixu8(l)

hunk ./darcsweb.cgi 1038

- l = l.decode(config.repoencoding, 'replace').encode('utf-8')

+ l = fixu8(l)

hunk ./darcsweb.cgi 2188

- repoencoding = c.repoencoding

+ # repoencoding must be a tuple

+ if c.repoencoding is str:

+ repoencoding = (c.repoencoding, )

+ else:

+ repoencoding = c.repoencoding

}

Sally sells sea-shells by the sea shore.

RSS