Hi Sean,

On Sat, Jun 19, 2010 at 12:36:09PM +0200, sean finney wrote:
> Package: python-debian
> Version: 0.1.16
> Severity: normal
> 
> hi,
> 
> attached is a patch that does something similar to what i was suggesting
> earlier.

Thanks for your patch.  Sorry it took me so long to get to it - I've
been moving and getting settled into a new job.

I wound up going with a different solution: if decoding with the
specified encoding fails, deb822 now uses the python-chardet package to
try to auto-detect the character encoding.  This way the user usually
won't have to worry at all about encoding issues.  It works for me on
the etch Sources file.

I've attached the patch I committed, and I'm about to upload.  Feel free
to reopen the bug if it doesn't solve your problem!

-- 
John Wright <j...@debian.org>
From c888992ee90e0e4cff61fedce395b7c40a0f2e8d Mon Sep 17 00:00:00 2001
From: John Wright <j...@debian.org>
Date: Sat, 24 Jul 2010 23:57:07 -0700
Subject: [PATCH] deb822: Use chardet to try to detect character encodings as necessary

This is only used when the specified encoding doesn't work.  It's mainly
useful for files containing multiple deb822 paragraphs with mixed
encodings, like etch's Sources file.

To make this consistent, the pure Python parser now just stores the raw
string, putting off the unicode conversion until the user tries to get
an item.
---
 debian/changelog                  |    4 ++++
 debian/control                    |    4 ++--
 lib/debian/deb822.py              |   37 +++++++++++++++++++++++++++----------
 tests/test_Sources.mixed_encoding |   34 ++++++++++++++++++++++++++++++++++
 tests/test_deb822.py              |   24 ++++++++++++++++++++++++
 5 files changed, 91 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_Sources.mixed_encoding

diff --git a/debian/changelog b/debian/changelog
index 9042957..2637242 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,6 +5,10 @@ python-debian (0.1.17) UNRELEASED; urgency=low
 
   [ John Wright ]
   * test_deb822.py: Make test_gpg_info more robust (Closes: #582878)
+  * deb822: Use chardet to try to detect character encodings as necessary
+    - This is only used when the specified encoding doesn't work.  It's mainly
+      useful for files containing multiple deb822 paragraphs with mixed
+      encodings, like etch's Sources file. (Closes: #586021)
 
  -- John Wright <j...@debian.org>  Fri, 25 Jun 2010 11:20:22 -0600
 
diff --git a/debian/control b/debian/control
index 3315ae7..d52ab3e 100644
--- a/debian/control
+++ b/debian/control
@@ -8,7 +8,7 @@ Uploaders: Adeodato Simó <d...@net.com.org.es>,
  Reinhard Tartler <siret...@tauware.de>,
  Stefano Zacchiroli <z...@debian.org>,
  John Wright <j...@debian.org>
-Build-Depends: debhelper (>= 5.0.37.2), python, python-setuptools
+Build-Depends: debhelper (>= 5.0.37.2), python, python-setuptools, python-chardet
 Build-Depends-Indep: python-support (>= 0.3)
 Standards-Version: 3.8.4
 Vcs-Browser: http://git.debian.org/?p=pkg-python-debian/python-debian.git
@@ -16,7 +16,7 @@ Vcs-Git: git://git.debian.org/git/pkg-python-debian/python-debian.git
 
 Package: python-debian
 Architecture: all
-Depends: ${python:Depends}, ${misc:Depends}
+Depends: ${python:Depends}, ${misc:Depends}, python-chardet
 Recommends: python-apt
 Suggests: gpgv
 Provides: python-deb822
diff --git a/lib/debian/deb822.py b/lib/debian/deb822.py
index 68af3d2..1a21e62 100644
--- a/lib/debian/deb822.py
+++ b/lib/debian/deb822.py
@@ -4,7 +4,7 @@
 # (.changes, .dsc, Packages, Sources, etc)
 #
 # Copyright (C) 2005-2006  dann frazier <da...@dannf.org>
-# Copyright (C) 2006-2008  John Wright <j...@johnwright.org>
+# Copyright (C) 2006-2010  John Wright <j...@johnwright.org>
 # Copyright (C) 2006       Adeodato Simó <d...@net.com.org.es>
 # Copyright (C) 2008       Stefano Zacchiroli <z...@upsilon.cc>
 #
@@ -30,10 +30,13 @@ try:
 except ImportError:
     _have_apt_pkg = False
 
+import chardet
 import new
 import re
 import string
 import sys
+import warnings
+
 import StringIO
 import UserDict
 
@@ -176,7 +179,25 @@ class Deb822Dict(object, UserDict.DictMixin):
 
         if isinstance(value, str):
             # Always return unicode objects instead of strings
-            value = value.decode(self.encoding)
+            try:
+                value = value.decode(self.encoding)
+            except UnicodeDecodeError, e:
+                # Evidently, the value wasn't encoded with the encoding the
+                # user specified.  Try detecting it.
+                warnings.warn('decoding from %s failed; attempting to detect '
+                              'the true encoding' % self.encoding,
+                              UnicodeWarning)
+                result = chardet.detect(value)
+                try:
+                    value = value.decode(result['encoding'])
+                except UnicodeDecodeError:
+                    raise e
+                else:
+                    # Assume the rest of the paragraph is in this encoding as
+                    # well (there's no sense in repeating this exercise for
+                    # every field).
+                    self.encoding = result['encoding']
+
         return value
 
     def __delitem__(self, key):
@@ -306,33 +327,29 @@ class Deb822(Deb822Dict):
         curkey = None
         content = ""
         for line in self.gpg_stripped_paragraph(sequence):
-            if isinstance(line, str):
-                line = line.decode(self.encoding)
             m = single.match(line)
             if m:
                 if curkey:
-                    self[curkey] += content
+                    self[curkey] = content
 
                 if not wanted_field(m.group('key')):
                     curkey = None
                     continue
 
                 curkey = m.group('key')
-                self[curkey] = m.group('data')
-                content = ""
+                content = m.group('data')
                 continue
 
             m = multi.match(line)
             if m:
                 if curkey:
-                    self[curkey] += content
+                    self[curkey] = content
 
                 if not wanted_field(m.group('key')):
                     curkey = None
                     continue
 
                 curkey = m.group('key')
-                self[curkey] = ""
                 content = ""
                 continue
 
@@ -342,7 +359,7 @@ class Deb822(Deb822Dict):
                 continue
 
         if curkey:
-            self[curkey] += content
+            self[curkey] = content
 
     def __str__(self):
         return self.dump()
diff --git a/tests/test_Sources.mixed_encoding b/tests/test_Sources.mixed_encoding
new file mode 100644
index 0000000..af2f3ca
--- /dev/null
+++ b/tests/test_Sources.mixed_encoding
@@ -0,0 +1,34 @@
+Package: amarok
+Binary: amarok, amarok-engines, amarok-xine
+Version: 1.4.4-4etch1
+Priority: optional
+Section: kde
+Maintainer: Adeodato Simó <d...@net.com.org.es>
+Build-Depends: cdbs, debhelper (>= 5), quilt, bzip2, automake1.9, libtool, kdelibs4-dev, kdemultimedia-dev, kdebase-dev, libxine-dev, libtag1-dev (>> 1.4), libsqlite3-dev, libtunepimp3-dev, libmysqlclient15-dev, libpq-dev, xmms-dev, libvisual-0.4-dev, libsdl1.2-dev, libifp-dev, libusb-dev, libgpod-dev, libnjb-dev, ruby, ruby1.8-dev, dpkg-dev (>= 1.13.19)
+Architecture: any
+Standards-Version: 3.7.2
+Format: 1.0
+Directory: pool/main/a/amarok
+Files:
+ f8e80af55fbd8386e6b13b0b12d798f4 986 amarok_1.4.4-4etch1.dsc
+ 0adbbd8373da2198b80e509618a2dab9 17628566 amarok_1.4.4.orig.tar.gz
+ c29b0538c033ededacc6d31339d17700 42402 amarok_1.4.4-4etch1.diff.gz
+Uploaders: Ana Beatriz Guerrero Lopez <a...@debian.org>
+
+Package: texinfo
+Binary: texinfo, info
+Version: 4.8.dfsg.1-4
+Priority: important
+Section: doc
+Maintainer: Norbert Preining <prein...@debian.org>
+Build-Depends: debhelper (>= 5), dpatch, libncurses5-dev | libncurses-dev, gettext
+Architecture: any
+Standards-Version: 3.7.2
+Format: 1.0
+Directory: pool/main/t/texinfo
+Files:
+ 2c233d2bf6627eac32deb9bb87726ea1 680 texinfo_4.8.dfsg.1-4.dsc
+ 614273ac8568a25926aae374cd9a6683 1926534 texinfo_4.8.dfsg.1.orig.tar.gz
+ e01520524bc114d90a2a1e5eefe71b50 101211 texinfo_4.8.dfsg.1-4.diff.gz
+Uploaders: Frank Küster <fr...@debian.org>
+
diff --git a/tests/test_deb822.py b/tests/test_deb822.py
index f6adcfd..891f4cd 100755
--- a/tests/test_deb822.py
+++ b/tests/test_deb822.py
@@ -21,6 +21,7 @@ import os
 import re
 import sys
 import unittest
+import warnings
 from StringIO import StringIO
 
 sys.path.insert(0, '../lib/debian/')
@@ -702,6 +703,29 @@ Description: python modules to work with Debian-related data formats
         self.assertEqual(utf8_contents, latin1_to_utf8.getvalue())
         self.assertEqual(latin1_contents, utf8_to_latin1.getvalue())
 
+    def test_mixed_encodings(self):
+        """Test that we can handle a simple case of mixed encodings
+
+        In general, this isn't guaranteed to work.  It uses the chardet
+        package, which tries to determine heuristically the encoding of the
+        text given to it.  But as far as I've seen, it's reliable for mixed
+        latin1 and utf-8 in maintainer names in old Sources files...
+        """
+
+        # Avoid spitting out the encoding warning during testing.
+        warnings.filterwarnings(action='ignore', category=UnicodeWarning)
+
+        filename = 'test_Sources.mixed_encoding'
+        for paragraphs in [deb822.Sources.iter_paragraphs(file(filename)),
+                           deb822.Sources.iter_paragraphs(file(filename),
+                                                          use_apt_pkg=False)]:
+            p1 = paragraphs.next()
+            self.assertEqual(p1['maintainer'],
+                             u'Adeodato Simó <d...@net.com.org.es>')
+            p2 = paragraphs.next()
+            self.assertEqual(p2['uploaders'],
+                             u'Frank Küster <fr...@debian.org>')
+
 class TestPkgRelations(unittest.TestCase):
 
     def test_packages(self):
-- 
1.7.1

Reply via email to