Rewrite encoding handling.
Fri Feb 24 00:31:28 UTC 2006 Alberto Bertogli <albertogli@telpin.com.ar>
* Rewrite encoding handling.
This patch rewrites fixu8() and affects a couple of places where decode() was
used directly.
It has the benefit of removing the ugly previous function, replacing it
with... well, another ugly function, but with a different kind of ugliness.
It also supports multiple encodings, so if the first one fails, a second one
is tried (and so on).
There are still some corner cases with files named in alternative encodings,
but it should work much better than the older code.
diff -rN -u old-darcsweb/config.py.sample new-darcsweb/config.py.sample
--- old-darcsweb/config.py.sample 2013-07-29 10:13:25.000000000 +0000
+++ new-darcsweb/config.py.sample 2013-07-29 10:13:25.000000000 +0000
@@ -80,6 +80,9 @@
# like 'utf-8' or 'UTF8') if you expect darcsweb to work properly.
# This is because to workaround a bug in darcs we need to do some
# codec mangling and it needs special cases for UTF8.
+ # You can, optionally, specify multiple encodings; they're tried in
+ # order, and if one fails to decode a string, the next one is tried.
+ # Example: repoencoding = "utf8", "latin1"
repoencoding = "latin1"
# as with the base configuration, the footer is also optional, and it
diff -rN -u old-darcsweb/darcsweb.cgi new-darcsweb/darcsweb.cgi
--- old-darcsweb/darcsweb.cgi 2013-07-29 10:13:25.000000000 +0000
+++ new-darcsweb/darcsweb.cgi 2013-07-29 10:13:25.000000000 +0000
@@ -88,42 +88,24 @@
# I _hate_ this.
def fixu8(s):
- openpos = s.find('[_')
- if openpos < 0:
- # small optimization to avoid the conversion to utf8 and
- # entering the loop
- if type(s) == unicode:
- # workaround for python < 2.4
- return s.encode('utf8')
- else:
- return s.decode(config.repoencoding).encode('utf8')
+ """Calls _fixu8(), which does the real work, line by line. Otherwise
+ we choose the wrong encoding for big buffers and end up messing
+ output."""
+ n = []
+ for i in s.split('\n'):
+ n.append(_fixu8(i))
+ return string.join(n, '\n')
+
+def _fixu8(s):
+ if type(s) == unicode:
+ return s.encode('utf8', 'replace')
+ for e in config.repoencoding:
+ try:
+ return s.decode(e).encode('utf8', 'replace')
+ except UnicodeDecodeError:
+ pass
+ raise 'DecodingError', config.repoencoding
- s = s.encode(config.repoencoding).decode('raw_unicode_escape')
- while openpos >= 0:
- closepos = s.find('_]', openpos)
- if closepos < 0:
- # not closed, probably just luck
- break
-
- # middle should be something like 'c3', so we get it by
- # removing the first three characters ("[_\")
- middle = s[openpos + 3:closepos]
- if len(middle) == 2:
- # now we turn middle into the character "\xc3"
- char = chr(int(middle, 16))
-
- # finally, replace s with our new improved string, and
- # repeat the ugly procedure
- char = char.decode(config.repoencoding)
- mn = '[_\\' + middle + '_]'
- s = s.replace(mn, char, 1)
- openpos = s.find('[_', openpos + 1)
-
- if config.repoencoding != 'utf8':
- s = s.encode('utf8')
- else:
- s = s.encode('raw_unicode_escape', 'replace')
- return s
def escape(s):
s = xml_escape(s)
@@ -585,7 +567,7 @@
"""Runs darcs on the repodir with the given params, return a file
object with its output."""
os.chdir(config.repodir)
- cmd = config.darcspath + "darcs " + params
+ cmd = 'DARCS_DONT_ESCAPE_8BIT=1 ' + config.darcspath + "darcs " + params
inf, outf = os.popen4(cmd, 't')
darcs_runs.append(params)
return outf
@@ -643,6 +625,24 @@
return i
return ''
+class XmlInputWrapper:
+ def __init__(self, fd):
+ self.fd = fd
+ self.times = 0
+ self._read = self.read
+
+ def read(self, *args, **kwargs):
+ self.times += 1
+ if self.times == 1:
+ return '<?xml version="1.0" encoding="utf-8"?>\n'
+ s = self.fd.read(*args, **kwargs)
+ if not s:
+ return s
+ return fixu8(s)
+
+ def close(self, *args, **kwargs):
+ return self.fd.close(*args, **kwargs)
+
# patch parsing, we get them through "darcs changes --xml-output"
class BuildPatchList(xml.sax.handler.ContentHandler):
@@ -812,7 +812,7 @@
# get the xml output and parse it
xmlf = run_darcs("changes --xml-output " + params)
- parser.parse(xmlf)
+ parser.parse(XmlInputWrapper(xmlf))
xmlf.close()
return handler
@@ -1009,7 +1009,7 @@
def print_diff(dsrc):
for l in dsrc:
- l = l.decode(config.repoencoding, 'replace').encode('utf-8')
+ l = fixu8(l)
# remove the trailing newline
if len(l) > 1:
@@ -1035,7 +1035,7 @@
def print_darcs_diff(dsrc):
for l in dsrc:
- l = l.decode(config.repoencoding, 'replace').encode('utf-8')
+ l = fixu8(l)
if not l.startswith(" "):
# comments and normal stuff
@@ -2185,7 +2185,13 @@
repodir = rdir
repodesc = desc
repourl = url
- repoencoding = c.repoencoding
+
+ # repoencoding must be a tuple; wrap a lone encoding name in one
+ if isinstance(c.repoencoding, basestring):
+ repoencoding = (c.repoencoding, )
+ else:
+ repoencoding = c.repoencoding
+
if 'footer' in dir(c):
footer = c.footer
config.__setattr__(name, tmp_config)