Fri Feb 24 00:31:28 UTC 2006  Alberto Bertogli <albertogli@telpin.com.ar>
  * Rewrite encoding handling.
  This patch rewrites fixu8() and affects a couple of places where decode() was
  used directly.
  
  It has the benefit of removing the ugly previous function, replacing it
  with...  well, another ugly function, but with a different kind of ugliness.
  
  It also supports multiple encodings, so if the first one fails, a second one
  is tried (and so on).
  
  There are still some corner cases with files named in alternative encodings,
  but it should work much better than the older code.
{
hunk ./config.py.sample 83
+	# You can, optionally, specify multiple encodings; they're tried in
+	# order, and if one fails to decode a string, the next one is tried.
+	# Example: repoencoding = "utf8", "latin1"
hunk ./darcsweb.cgi 91
-	openpos = s.find('[_')
-	if openpos < 0:
-		# small optimization to avoid the conversion to utf8 and
-		# entering the loop
-		if type(s) == unicode:
-			# workaround for python < 2.4
-			return s.encode('utf8')
-		else:
-			return s.decode(config.repoencoding).encode('utf8')
-
-	s = s.encode(config.repoencoding).decode('raw_unicode_escape')
-	while openpos >= 0:
-		closepos = s.find('_]', openpos)
-		if closepos < 0:
-			# not closed, probably just luck
-			break
-
-		# middle should be something like 'c3', so we get it by
-		# removing the first three characters ("[_\")
-		middle = s[openpos + 3:closepos]
-		if len(middle) == 2:
-			# now we turn middle into the character "\xc3"
-			char = chr(int(middle, 16))
+	"""Calls _fixu8(), which does the real work, line by line. Otherwise
+	we choose the wrong encoding for big buffers and end up messing
+	output."""
+	n = []
+	for i in s.split('\n'):
+		n.append(_fixu8(i))
+	return string.join(n, '\n')
hunk ./darcsweb.cgi 99
-			# finally, replace s with our new improved string, and
-			# repeat the ugly procedure
-			char = char.decode(config.repoencoding)
-			mn = '[_\\' + middle + '_]'
-			s = s.replace(mn, char, 1)
-		openpos = s.find('[_', openpos + 1)
+def _fixu8(s):
+	if type(s) == unicode:
+		return s.encode('utf8', 'replace')
+	for e in config.repoencoding:
+		try:
+			return s.decode(e).encode('utf8', 'replace')
+		except UnicodeDecodeError:
+			pass
+	raise 'DecodingError', config.repoencoding
hunk ./darcsweb.cgi 109
-	if config.repoencoding != 'utf8':
-		s = s.encode('utf8')
-	else:
-		s = s.encode('raw_unicode_escape', 'replace')
-	return s
hunk ./darcsweb.cgi 570
-	cmd = config.darcspath + "darcs " + params
+	cmd = 'DARCS_DONT_ESCAPE_8BIT=1 ' + config.darcspath + "darcs " + params
hunk ./darcsweb.cgi 628
+class XmlInputWrapper:
+	def __init__(self, fd):
+		self.fd = fd
+		self.times = 0
+		self._read = self.read
+
+	def read(self, *args, **kwargs):
+		self.times += 1
+		if self.times == 1:
+			return '<?xml version="1.0" encoding="utf-8"?>\n'
+		s = self.fd.read(*args, **kwargs)
+		if not s:
+			return s
+		return fixu8(s)
+
+	def close(self, *args, **kwargs):
+		return self.fd.close(*args, **kwargs)
+
hunk ./darcsweb.cgi 815
-	parser.parse(xmlf)
+	parser.parse(XmlInputWrapper(xmlf))
hunk ./darcsweb.cgi 1012
-		l = l.decode(config.repoencoding, 'replace').encode('utf-8')
+		l = fixu8(l)
hunk ./darcsweb.cgi 1038
-		l = l.decode(config.repoencoding, 'replace').encode('utf-8')
+		l = fixu8(l)
hunk ./darcsweb.cgi 2188
-				repoencoding = c.repoencoding
+
+				# repoencoding must be a tuple
+				if c.repoencoding is str:
+					repoencoding = (c.repoencoding, )
+				else:
+					repoencoding = c.repoencoding
+
}