Package: python3-debian Version: 0.1.38 Severity: normal Tags: patch Deb822 modules cannot properly be serialized by the pickle module.
It turns out the dict uses _CaseInsensitiveString which stores a pre-computed hash value of a `str`. However Python randomizes hashes, i.e., one will get a different hash for a _CaseInsensitiveString restored by the pickle module and a newly created _CaseInsensitiveString. As a workaround one can set the environment variable `PYTHONHASHSEED=0`. A short demo program is attached: call deb822pickler.py first, then deb822unpickler.py. I attached patches to just drop the precomputed hash. This makes repeatedly calling `hash()` on the same string a bit slower, but on the other hand calling `hash()` only once is a bit faster. I don't think one usually needs to call `hash()` on the same string repeatedly. Also attached is a patch using `__slots__` to declare the member variable. This makes creating a _CaseInsensitveString and calling hash a bit faster. I've attached a short benchmark script to compare the implementations. On my system I get: Using __slots__: Calling hash() repeatedly: Old implementation: 0.14679651000187732 New implementation: 0.16284992700093426 Creating string and calling hash once: Old implementation: 0.6289306499966187 New implementation: 0.5861582299985457 Not using __slots__: Calling hash() repeatedly: Old implementation: 0.15407300100196153 New implementation: 0.1629600290034432 Creating string and calling hash once: Old implementation: 0.6767919499980053 New implementation: 0.614397427008953 Ansgar
#! /usr/bin/python3 import pickle from debian.deb822 import Deb822 d = Deb822("Field: value") with open("db.pickle", "wb") as fh: pickle.dump(d, fh)
#! /usr/bin/python3 import pickle from debian.deb822 import Deb822 with open("db.pickle", "rb") as fh: d = pickle.load(fh) print(d)
#! /usr/bin/python3 import timeit class _OldCaseInsensitiveString(str): #str_lower = '' #str_lower_hash = 0 __slots__ = 'str_lower', 'str_lower_hash' def __new__(cls, str_): # type: ignore s = str.__new__(cls, str_) # type: ignore s.str_lower = str_.lower() s.str_lower_hash = hash(s.str_lower) return s def __hash__(self): # type: () -> int return self.str_lower_hash class _CaseInsensitiveString(str): #str_lower = '' __slots__ = 'str_lower' def __new__(cls, str_): # type: ignore s = str.__new__(cls, str_) # type: ignore s.str_lower = str_.lower() return s def __hash__(self): # type: () -> int return hash(self.str_lower) s_old = _OldCaseInsensitiveString("Field") t_old = timeit.timeit('hash(s_old)', globals=globals()) s_new = _CaseInsensitiveString("Field") t_new = timeit.timeit('hash(s_new)', globals=globals()) print("Calling hash() repeatedly:") print("Old implementation: ", t_old) print("New implementation: ", t_new) t_old = timeit.timeit('hash(_OldCaseInsensitiveString("Field"))', globals=globals()) t_new = timeit.timeit('hash(_CaseInsensitiveString("Field"))', globals=globals()) print("Creating string and calling hash once:") print("Old implementation: ", t_old) print("New implementation: ", t_new)
>From 54165117266fcf419bc572e30bf66b4249074328 Mon Sep 17 00:00:00 2001 From: Ansgar <ans...@debian.org> Date: Thu, 26 Nov 2020 16:42:16 +0100 Subject: [PATCH 1/2] _CaseInsensitiveString: do not precompute hash This allows the "pickle" module to work correctly: hashing a str will give different results in different invocations of the Python interpreter, so the old hash value should not be restored. Not storing the hash value at all works for this. --- lib/debian/deb822.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/debian/deb822.py b/lib/debian/deb822.py index 9481a9c..d07abbb 100644 --- a/lib/debian/deb822.py +++ b/lib/debian/deb822.py @@ -2513,17 +2513,15 @@ class _CaseInsensitiveString(str): # Fake definitions because mypy doesn't find them in __new__ ## CRUFT # https://github.com/python/mypy/issues/1021 str_lower = '' - str_lower_hash = 0 def __new__(cls, str_): # type: ignore s = str.__new__(cls, str_) # type: ignore s.str_lower = str_.lower() - s.str_lower_hash = hash(s.str_lower) return s def __hash__(self): # type: () -> int - return self.str_lower_hash + return hash(self.str_lower) def __eq__(self, other): # type: (Any) -> Any -- 2.29.2
>From a547de6d8acee22eeb6f73ab570de76e80f21e6c Mon Sep 17 00:00:00 2001 From: Ansgar <ans...@debian.org> Date: Thu, 26 Nov 2020 16:46:49 +0100 Subject: [PATCH 2/2] _CaseInsensitiveString: declare data member using `__slots__` --- lib/debian/deb822.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/debian/deb822.py b/lib/debian/deb822.py index d07abbb..748383e 100644 --- a/lib/debian/deb822.py +++ b/lib/debian/deb822.py @@ -2510,9 +2510,7 @@ class Removals(Deb822): class _CaseInsensitiveString(str): """Case insensitive string. """ - # Fake definitions because mypy doesn't find them in __new__ ## CRUFT - # https://github.com/python/mypy/issues/1021 - str_lower = '' + __slots__ = 'str_lower' def __new__(cls, str_): # type: ignore s = str.__new__(cls, str_) # type: ignore -- 2.29.2