New submission from Skip Montanaro:
The topic of avoiding string copies in certain string methods came up on the ChiPy list:
http://mail.python.org/pipermail/chicago/2007-December/002975.html.
The attached patch modifies the split and rsplit implementations to avoid
making a copy of self when the split fails to find anything to split on:
>>> s = "abc def"
>>> x = s.split(';')
>>> x[0] is s
True
>>> y = s.rsplit('-')
>>> y[0] is s
True
>>> t = "abcdef"
>>> x = t.split()
>>> x[0] is t
True
>>> y = t.rsplit()
>>> y[0] is t
True
All tests pass. Given that this is just a small optimization, I
don't believe any changes to the docs or the existing tests are
necessary.
----------
components: Interpreter Core
files: string-split.patch
keywords: patch
messages: 58081
nosy: skip.montanaro
priority: normal
severity: normal
status: open
title: Avoid string copy when split char doesn't match
type: rfe
versions: Python 2.6
Added file: http://bugs.python.org/file8851/string-split.patch
__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1538>
__________________________________
*** /tmp/skip/ediffWFAoxm Sun Dec 2 01:28:32 2007
--- /Users/skip/src/python/trunk/Objects/stringobject.c Sun Dec 2 01:27:56 2007
***************
*** 1403,1410 ****
#define RSKIP_NONSPACE(s, i) { while (i>=0 &&
!isspace(Py_CHARMASK(s[i]))) i--; }
Py_LOCAL_INLINE(PyObject *)
! split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxsplit)
{
Py_ssize_t i, j, count=0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
--- 1403,1411 ----
#define RSKIP_NONSPACE(s, i) { while (i>=0 &&
!isspace(Py_CHARMASK(s[i]))) i--; }
Py_LOCAL_INLINE(PyObject *)
! split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
{
+ const char *s = PyString_AS_STRING(self);
Py_ssize_t i, j, count=0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
***************
*** 1419,1424 ****
--- 1420,1432 ----
if (i==len) break;
j = i; i++;
SKIP_NONSPACE(s, i, len);
+ if (j == 0 && i == len) {
+ /* No whitespace in self, so just use it as list[0] */
+ Py_INCREF(self);
+ PyList_SET_ITEM(list, 0, (PyObject *)self);
+ count++;
+ break;
+ }
SPLIT_ADD(s, j, i);
}
***************
*** 1437,1444 ****
}
Py_LOCAL_INLINE(PyObject *)
! split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
register Py_ssize_t i, j, count=0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
--- 1445,1453 ----
}
Py_LOCAL_INLINE(PyObject *)
! split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
+ const char *s = PyString_AS_STRING(self);
register Py_ssize_t i, j, count=0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
***************
*** 1457,1463 ****
}
}
}
! if (i <= len) {
SPLIT_ADD(s, i, len);
}
FIX_PREALLOC_SIZE(list);
--- 1466,1478 ----
}
}
}
! if (i == 0 && count == 0) {
! /* ch not in self, so just use self as list[0] */
! Py_INCREF(self);
! PyList_SET_ITEM(list, 0, (PyObject *)self);
! count++;
! }
! else if (i <= len) {
SPLIT_ADD(s, i, len);
}
FIX_PREALLOC_SIZE(list);
***************
*** 1492,1498 ****
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
! return split_whitespace(s, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
--- 1507,1513 ----
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
! return split_whitespace(self, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
***************
*** 1509,1515 ****
return NULL;
}
else if (n == 1)
! return split_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
--- 1524,1530 ----
return NULL;
}
else if (n == 1)
! return split_char(self, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
***************
*** 1609,1616 ****
}
Py_LOCAL_INLINE(PyObject *)
! rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxsplit)
{
Py_ssize_t i, j, count=0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
--- 1624,1632 ----
}
Py_LOCAL_INLINE(PyObject *)
! rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
{
+ const char *s = PyString_AS_STRING(self);
Py_ssize_t i, j, count=0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
***************
*** 1625,1630 ****
--- 1641,1653 ----
if (i<0) break;
j = i; i--;
RSKIP_NONSPACE(s, i);
+ if (j == len-1 && i < 0) {
+ /* No whitespace in self, so just use it as list[0] */
+ Py_INCREF(self);
+ PyList_SET_ITEM(list, 0, (PyObject *)self);
+ count++;
+ break;
+ }
SPLIT_ADD(s, i + 1, j + 1);
}
if (i >= 0) {
***************
*** 1645,1652 ****
}
Py_LOCAL_INLINE(PyObject *)
! rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
register Py_ssize_t i, j, count=0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
--- 1668,1676 ----
}
Py_LOCAL_INLINE(PyObject *)
! rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t
maxcount)
{
+ const char *s = PyString_AS_STRING(self);
register Py_ssize_t i, j, count=0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
***************
*** 1664,1670 ****
}
}
}
! if (j >= -1) {
SPLIT_ADD(s, 0, j + 1);
}
FIX_PREALLOC_SIZE(list);
--- 1688,1700 ----
}
}
}
! if (i < 0 && count == 0) {
! /* ch not in self, so just use self as list[0] */
! Py_INCREF(self);
! PyList_SET_ITEM(list, 0, (PyObject *)self);
! count++;
! }
! else if (j >= -1) {
SPLIT_ADD(s, 0, j + 1);
}
FIX_PREALLOC_SIZE(list);
***************
*** 1691,1697 ****
{
Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
Py_ssize_t maxsplit = -1, count=0;
! const char *s = PyString_AS_STRING(self), *sub;
PyObject *list, *str, *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
--- 1721,1727 ----
{
Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
Py_ssize_t maxsplit = -1, count=0;
! const char *s, *sub;
PyObject *list, *str, *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
***************
*** 1699,1705 ****
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
! return rsplit_whitespace(s, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
--- 1729,1735 ----
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
! return rsplit_whitespace(self, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
***************
*** 1716,1722 ****
return NULL;
}
else if (n == 1)
! return rsplit_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
--- 1746,1752 ----
return NULL;
}
else if (n == 1)
! return rsplit_char(self, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
***************
*** 1725,1730 ****
--- 1755,1761 ----
j = len;
i = j - n;
+ s = PyString_AS_STRING(self);
while ( (i >= 0) && (maxsplit-- > 0) ) {
for (; i>=0; i--) {
if (Py_STRING_MATCH(s, i, sub, n)) {
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com