darcs - darcsweb

repos / darcsweb / commitdiff

Rewrite encoding handling.

Fri Feb 24 00:31:28 UTC 2006 Alberto Bertogli <albertogli@telpin.com.ar>

* Rewrite encoding handling.

This patch rewrites fixu8() and affects a couple of places where decode() was

used directly.

It has the benefit of removing the ugly previous function, replacing it

with... well, another ugly function, but with a different kind of ugliness.

It also supports multiple encodings, so if the first one fails, a second one

is tried (and so on).

There are still some corner cases with files named in alternative encodings,

but it should work much better than the older code.

diff -rN -u old-darcsweb/config.py.sample new-darcsweb/config.py.sample

--- old-darcsweb/config.py.sample 2013-07-29 10:13:25.000000000 +0000

+++ new-darcsweb/config.py.sample 2013-07-29 10:13:25.000000000 +0000

@@ -80,6 +80,9 @@

# like 'utf-8' or 'UTF8') if you expect darcsweb to work properly.

# This is because to workaround a bug in darcs we need to do some

# codec mangling and it needs special cases for UTF8.

+ # You can, optionally, specify multiple encodings; they're tried in

+ # order, and if one fails to decode a string, the next one is tried.

+ # Example: repoencoding = "utf8", "latin1"

repoencoding = "latin1"

# as with the base configuration, the footer is also optional, and it

diff -rN -u old-darcsweb/darcsweb.cgi new-darcsweb/darcsweb.cgi

--- old-darcsweb/darcsweb.cgi 2013-07-29 10:13:25.000000000 +0000

+++ new-darcsweb/darcsweb.cgi 2013-07-29 10:13:25.000000000 +0000

@@ -88,42 +88,24 @@

# I _hate_ this.

def fixu8(s):

- openpos = s.find('[_')

- if openpos < 0:

- # small optimization to avoid the conversion to utf8 and

- # entering the loop

- if type(s) == unicode:

- # workaround for python < 2.4

- return s.encode('utf8')

- else:

- return s.decode(config.repoencoding).encode('utf8')

+ """Calls _fixu8(), which does the real work, line by line. Otherwise

+ we choose the wrong encoding for big buffers and end up messing

+ output."""

+ n = []

+ for i in s.split('\n'):

+ n.append(_fixu8(i))

+ return string.join(n, '\n')

+def _fixu8(s):

+ if type(s) == unicode:

+ return s.encode('utf8', 'replace')

+ for e in config.repoencoding:

+ try:

+ return s.decode(e).encode('utf8', 'replace')

+ except UnicodeDecodeError:

+ pass

+ raise 'DecodingError', config.repoencoding

- s = s.encode(config.repoencoding).decode('raw_unicode_escape')

- while openpos >= 0:

- closepos = s.find('_]', openpos)

- if closepos < 0:

- # not closed, probably just luck

- break

- # middle should be something like 'c3', so we get it by

- # removing the first three characters ("[_\")

- middle = s[openpos + 3:closepos]

- if len(middle) == 2:

- # now we turn middle into the character "\xc3"

- char = chr(int(middle, 16))

- # finally, replace s with our new improved string, and

- # repeat the ugly procedure

- char = char.decode(config.repoencoding)

- mn = '[_\\' + middle + '_]'

- s = s.replace(mn, char, 1)

- openpos = s.find('[_', openpos + 1)

- if config.repoencoding != 'utf8':

- s = s.encode('utf8')

- else:

- s = s.encode('raw_unicode_escape', 'replace')

- return s

def escape(s):

s = xml_escape(s)

@@ -585,7 +567,7 @@

"""Runs darcs on the repodir with the given params, return a file

object with its output."""

os.chdir(config.repodir)

- cmd = config.darcspath + "darcs " + params

+ cmd = 'DARCS_DONT_ESCAPE_8BIT=1 ' + config.darcspath + "darcs " + params

inf, outf = os.popen4(cmd, 't')

darcs_runs.append(params)

return outf

@@ -643,6 +625,24 @@

return i

return ''

+class XmlInputWrapper:

+ def __init__(self, fd):

+ self.fd = fd

+ self.times = 0

+ self._read = self.read

+ def read(self, *args, **kwargs):

+ self.times += 1

+ if self.times == 1:

+ return '<?xml version="1.0" encoding="utf-8"?>\n'

+ s = self.fd.read(*args, **kwargs)

+ if not s:

+ return s

+ return fixu8(s)

+ def close(self, *args, **kwargs):

+ return self.fd.close(*args, **kwargs)

# patch parsing, we get them through "darcs changes --xml-output"

class BuildPatchList(xml.sax.handler.ContentHandler):

@@ -812,7 +812,7 @@

# get the xml output and parse it

xmlf = run_darcs("changes --xml-output " + params)

- parser.parse(xmlf)

+ parser.parse(XmlInputWrapper(xmlf))

xmlf.close()

return handler

@@ -1009,7 +1009,7 @@

def print_diff(dsrc):

for l in dsrc:

- l = l.decode(config.repoencoding, 'replace').encode('utf-8')

+ l = fixu8(l)

# remove the trailing newline

if len(l) > 1:

@@ -1035,7 +1035,7 @@

def print_darcs_diff(dsrc):

for l in dsrc:

- l = l.decode(config.repoencoding, 'replace').encode('utf-8')

+ l = fixu8(l)

if not l.startswith(" "):

# comments and normal stuff

@@ -2185,7 +2185,13 @@

repodir = rdir

repodesc = desc

repourl = url

- repoencoding = c.repoencoding

+ # repoencoding must be a tuple

+ if c.repoencoding is str:

+ repoencoding = (c.repoencoding, )

+ else:

+ repoencoding = c.repoencoding

if 'footer' in dir(c):

footer = c.footer

config.__setattr__(name, tmp_config)

Sally sells sea-shells by the sea shore.

RSS