Fri Feb 24 00:31:28 UTC 2006 Alberto Bertogli * Rewrite encoding handling. This patch rewrites fixu8() and affects a couple of places where decode() was used directly. It has the benefit of removing the ugly previous function, replacing it with... well, another ugly function, but with a different kind of ugliness. It also supports multiple encodings, so if the first one fails, a second one is tried (and so on). There are still some corner cases with files named in alternative encodings, but it should work much better than the older code. diff -rN -u old-darcsweb/config.py.sample new-darcsweb/config.py.sample --- old-darcsweb/config.py.sample 2014-10-30 14:26:56.000000000 +0000 +++ new-darcsweb/config.py.sample 2014-10-30 14:26:56.000000000 +0000 @@ -80,6 +80,9 @@ # like 'utf-8' or 'UTF8') if you expect darcsweb to work properly. # This is because to workaround a bug in darcs we need to do some # codec mangling and it needs special cases for UTF8. + # You can, optionally, specify multiple encodings; they're tried in + # order, and if one fails to decode a string, the next one is tried. + # Example: repoencoding = "utf8", "latin1" repoencoding = "latin1" # as with the base configuration, the footer is also optional, and it diff -rN -u old-darcsweb/darcsweb.cgi new-darcsweb/darcsweb.cgi --- old-darcsweb/darcsweb.cgi 2014-10-30 14:26:56.000000000 +0000 +++ new-darcsweb/darcsweb.cgi 2014-10-30 14:26:56.000000000 +0000 @@ -88,42 +88,24 @@ # I _hate_ this. def fixu8(s): - openpos = s.find('[_') - if openpos < 0: - # small optimization to avoid the conversion to utf8 and - # entering the loop - if type(s) == unicode: - # workaround for python < 2.4 - return s.encode('utf8') - else: - return s.decode(config.repoencoding).encode('utf8') + """Calls _fixu8(), which does the real work, line by line. 
Otherwise + we choose the wrong encoding for big buffers and end up messing + output.""" + n = [] + for i in s.split('\n'): + n.append(_fixu8(i)) + return string.join(n, '\n') + +def _fixu8(s): + if type(s) == unicode: + return s.encode('utf8', 'replace') + for e in config.repoencoding: + try: + return s.decode(e).encode('utf8', 'replace') + except UnicodeDecodeError: + pass + raise 'DecodingError', config.repoencoding - s = s.encode(config.repoencoding).decode('raw_unicode_escape') - while openpos >= 0: - closepos = s.find('_]', openpos) - if closepos < 0: - # not closed, probably just luck - break - - # middle should be something like 'c3', so we get it by - # removing the first three characters ("[_\") - middle = s[openpos + 3:closepos] - if len(middle) == 2: - # now we turn middle into the character "\xc3" - char = chr(int(middle, 16)) - - # finally, replace s with our new improved string, and - # repeat the ugly procedure - char = char.decode(config.repoencoding) - mn = '[_\\' + middle + '_]' - s = s.replace(mn, char, 1) - openpos = s.find('[_', openpos + 1) - - if config.repoencoding != 'utf8': - s = s.encode('utf8') - else: - s = s.encode('raw_unicode_escape', 'replace') - return s def escape(s): s = xml_escape(s) @@ -585,7 +567,7 @@ """Runs darcs on the repodir with the given params, return a file object with its output.""" os.chdir(config.repodir) - cmd = config.darcspath + "darcs " + params + cmd = 'DARCS_DONT_ESCAPE_8BIT=1 ' + config.darcspath + "darcs " + params inf, outf = os.popen4(cmd, 't') darcs_runs.append(params) return outf @@ -643,6 +625,24 @@ return i return '' +class XmlInputWrapper: + def __init__(self, fd): + self.fd = fd + self.times = 0 + self._read = self.read + + def read(self, *args, **kwargs): + self.times += 1 + if self.times == 1: + return '\n' + s = self.fd.read(*args, **kwargs) + if not s: + return s + return fixu8(s) + + def close(self, *args, **kwargs): + return self.fd.close(*args, **kwargs) + # patch parsing, we get them 
through "darcs changes --xml-output" class BuildPatchList(xml.sax.handler.ContentHandler): @@ -812,7 +812,7 @@ # get the xml output and parse it xmlf = run_darcs("changes --xml-output " + params) - parser.parse(xmlf) + parser.parse(XmlInputWrapper(xmlf)) xmlf.close() return handler @@ -1009,7 +1009,7 @@ def print_diff(dsrc): for l in dsrc: - l = l.decode(config.repoencoding, 'replace').encode('utf-8') + l = fixu8(l) # remove the trailing newline if len(l) > 1: @@ -1035,7 +1035,7 @@ def print_darcs_diff(dsrc): for l in dsrc: - l = l.decode(config.repoencoding, 'replace').encode('utf-8') + l = fixu8(l) if not l.startswith(" "): # comments and normal stuff @@ -2185,7 +2185,13 @@ repodir = rdir repodesc = desc repourl = url - repoencoding = c.repoencoding + + # repoencoding must be a tuple + if c.repoencoding is str: + repoencoding = (c.repoencoding, ) + else: + repoencoding = c.repoencoding + if 'footer' in dir(c): footer = c.footer config.__setattr__(name, tmp_config)