This is an automated email from the ASF dual-hosted git repository.
timsaucer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new acd9a8dc Complete doc string examples for functions.py (#1435)
acd9a8dc is described below
commit acd9a8dcdd1015497835ae3c9a49e4bf5961d719
Author: Nick <[email protected]>
AuthorDate: Fri Mar 27 14:19:18 2026 -0400
Complete doc string examples for functions.py (#1435)
* Verify all non-alias functions have doc string
* Move all alias for statements to see also blocks and confirm no examples
* Fix google doc style for all examples
* Remove builtins use
* Add coverage for optional filter
* Cover optional argument examples for window and value functions
* Cover optional arguments for scalar functions
* Cover array and aggregation functions
* Make examples different
* Make format more consistent
* Remove duplicated df definition
---
python/datafusion/functions.py | 2048 ++++++++++++++++++++++++++--------------
1 file changed, 1353 insertions(+), 695 deletions(-)
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index e85d710e..f062cbfc 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -386,8 +386,6 @@ def list_to_string(expr: Expr, delimiter: Expr) -> Expr:
def list_join(expr: Expr, delimiter: Expr) -> Expr:
"""Converts each element to its text representation.
- This is an alias for :py:func:`array_to_string`.
-
See Also:
This is an alias for :py:func:`array_to_string`.
"""
@@ -407,6 +405,15 @@ def in_list(arg: Expr, values: list[Expr], negated: bool = False) -> Expr:
... )
>>> result.collect_column("in").to_pylist()
[True, False, True]
+
+ >>> result = df.select(
+ ... dfn.functions.in_list(
+ ... dfn.col("a"), [dfn.lit(1), dfn.lit(3)],
+ ... negated=True,
+ ... ).alias("not_in")
+ ... )
+ >>> result.collect_column("not_in").to_pylist()
+ [False, True, False]
"""
values = [v.expr for v in values]
return Expr(f.in_list(arg.expr, values, negated))
@@ -468,9 +475,15 @@ def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> So
"""Creates a new sort expression.
Examples:
- >>> sort_expr = dfn.functions.order_by(dfn.col("a"), ascending=False)
+ >>> sort_expr = dfn.functions.order_by(
+ ... dfn.col("a"), ascending=False)
>>> sort_expr.ascending()
False
+
+ >>> sort_expr = dfn.functions.order_by(
+ ... dfn.col("a"), ascending=True, nulls_first=False)
+ >>> sort_expr.nulls_first()
+ False
"""
return SortExpr(expr, ascending=ascending, nulls_first=nulls_first)
@@ -486,10 +499,23 @@ def alias(expr: Expr, name: str, metadata: dict[str, str] | None = None) -> Expr
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2]})
- >>> df.select(
- ... dfn.functions.alias(dfn.col("a"), "b")
- ... ).collect_column("b")[0].as_py()
+ >>> result = df.select(
+ ... dfn.functions.alias(
+ ... dfn.col("a"), "b"
+ ... )
+ ... )
+ >>> result.collect_column("b")[0].as_py()
1
+
+ >>> result = df.select(
+ ... dfn.functions.alias(
+ ... dfn.col("a"), "b", metadata={"info": "test"}
+ ... )
+ ... )
+ >>> result.schema()
+ b: int64
+ -- field metadata --
+ info: 'test'
"""
return Expr(f.alias(expr.expr, name, metadata))
@@ -520,9 +546,18 @@ def count_star(filter: Expr | None = None) -> Expr:
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
- >>> result = df.aggregate([], [dfn.functions.count_star().alias("cnt")])
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.count_star(
+ ... ).alias("cnt")])
>>> result.collect_column("cnt")[0].as_py()
3
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.count_star(
+ ... filter=dfn.col("a") > dfn.lit(1)
+ ... ).alias("cnt")])
+ >>> result.collect_column("cnt")[0].as_py()
+ 2
"""
return count(Expr.literal(1), filter=filter)
@@ -652,12 +687,11 @@ def ascii(arg: Expr) -> Expr:
"""Returns the numeric code of the first character of the argument.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["a","b","c"]})
- >>> ascii_df = df.select(dfn.functions.ascii(dfn.col("a")).alias("ascii"))
- >>> ascii_df.collect_column("ascii")[0].as_py()
- 97
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["a","b","c"]})
+ >>> ascii_df = df.select(dfn.functions.ascii(dfn.col("a")).alias("ascii"))
+ >>> ascii_df.collect_column("ascii")[0].as_py()
+ 97
"""
return Expr(f.ascii(arg.expr))
@@ -732,12 +766,11 @@ def bit_length(arg: Expr) -> Expr:
"""Returns the number of bits in the string argument.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["a","b","c"]})
- >>> bit_df = df.select(dfn.functions.bit_length(dfn.col("a")).alias("bit_len"))
- >>> bit_df.collect_column("bit_len")[0].as_py()
- 8
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["a","b","c"]})
+ >>> bit_df = df.select(dfn.functions.bit_length(dfn.col("a")).alias("bit_len"))
+ >>> bit_df.collect_column("bit_len")[0].as_py()
+ 8
"""
return Expr(f.bit_length(arg.expr))
@@ -746,12 +779,11 @@ def btrim(arg: Expr) -> Expr:
"""Removes all characters, spaces by default, from both sides of a string.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [" a "]})
- >>> trim_df = df.select(dfn.functions.btrim(dfn.col("a")).alias("trimmed"))
- >>> trim_df.collect_column("trimmed")[0].as_py()
- 'a'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [" a "]})
+ >>> trim_df = df.select(dfn.functions.btrim(dfn.col("a")).alias("trimmed"))
+ >>> trim_df.collect_column("trimmed")[0].as_py()
+ 'a'
"""
return Expr(f.btrim(arg.expr))
@@ -786,13 +818,12 @@ def character_length(arg: Expr) -> Expr:
"""Returns the number of characters in the argument.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["abc","b","c"]})
- >>> char_len_df = df.select(
- ... dfn.functions.character_length(dfn.col("a")).alias("char_len"))
- >>> char_len_df.collect_column("char_len")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["abc","b","c"]})
+ >>> char_len_df = df.select(
+ ... dfn.functions.character_length(dfn.col("a")).alias("char_len"))
+ >>> char_len_df.collect_column("char_len")[0].as_py()
+ 3
"""
return Expr(f.character_length(arg.expr))
@@ -801,12 +832,11 @@ def length(string: Expr) -> Expr:
"""The number of characters in the ``string``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result = df.select(dfn.functions.length(dfn.col("a")).alias("len"))
- >>> result.collect_column("len")[0].as_py()
- 5
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result = df.select(dfn.functions.length(dfn.col("a")).alias("len"))
+ >>> result.collect_column("len")[0].as_py()
+ 5
"""
return Expr(f.length(string.expr))
@@ -815,12 +845,11 @@ def char_length(string: Expr) -> Expr:
"""The number of characters in the ``string``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result = df.select(dfn.functions.char_length(dfn.col("a")).alias("len"))
- >>> result.collect_column("len")[0].as_py()
- 5
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result = df.select(dfn.functions.char_length(dfn.col("a")).alias("len"))
+ >>> result.collect_column("len")[0].as_py()
+ 5
"""
return Expr(f.char_length(string.expr))
@@ -829,12 +858,11 @@ def chr(arg: Expr) -> Expr:
"""Converts the Unicode code point to a UTF8 character.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [65]})
- >>> result = df.select(dfn.functions.chr(dfn.col("a")).alias("chr"))
- >>> result.collect_column("chr")[0].as_py()
- 'A'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [65]})
+ >>> result = df.select(dfn.functions.chr(dfn.col("a")).alias("chr"))
+ >>> result.collect_column("chr")[0].as_py()
+ 'A'
"""
return Expr(f.chr(arg.expr))
@@ -914,13 +942,12 @@ def ends_with(arg: Expr, suffix: Expr) -> Expr:
"""Returns true if the ``string`` ends with the ``suffix``, false
otherwise.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["abc","b","c"]})
- >>> ends_with_df = df.select(
- ... dfn.functions.ends_with(dfn.col("a"),
dfn.lit("c")).alias("ends_with"))
- >>> ends_with_df.collect_column("ends_with")[0].as_py()
- True
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["abc","b","c"]})
+ >>> ends_with_df = df.select(
+ ... dfn.functions.ends_with(dfn.col("a"),
dfn.lit("c")).alias("ends_with"))
+ >>> ends_with_df.collect_column("ends_with")[0].as_py()
+ True
"""
return Expr(f.ends_with(arg.expr, suffix.expr))
@@ -962,13 +989,12 @@ def find_in_set(string: Expr, string_list: Expr) -> Expr:
The string list is a string composed of substrings separated by ``,``
characters.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["b"]})
- >>> result = df.select(
- ... dfn.functions.find_in_set(dfn.col("a"),
dfn.lit("a,b,c")).alias("pos"))
- >>> result.collect_column("pos")[0].as_py()
- 2
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["b"]})
+ >>> result = df.select(
+ ... dfn.functions.find_in_set(dfn.col("a"),
dfn.lit("a,b,c")).alias("pos"))
+ >>> result.collect_column("pos")[0].as_py()
+ 2
"""
return Expr(f.find_in_set(string.expr, string_list.expr))
@@ -1008,12 +1034,11 @@ def initcap(string: Expr) -> Expr:
characters to lowercase.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["the cat"]})
- >>> cap_df = df.select(dfn.functions.initcap(dfn.col("a")).alias("cap"))
- >>> cap_df.collect_column("cap")[0].as_py()
- 'The Cat'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["the cat"]})
+ >>> cap_df = df.select(dfn.functions.initcap(dfn.col("a")).alias("cap"))
+ >>> cap_df.collect_column("cap")[0].as_py()
+ 'The Cat'
"""
return Expr(f.initcap(string.expr))
@@ -1021,7 +1046,8 @@ def initcap(string: Expr) -> Expr:
def instr(string: Expr, substring: Expr) -> Expr:
"""Finds the position from where the ``substring`` matches the ``string``.
- This is an alias for :py:func:`strpos`.
+ See Also:
+ This is an alias for :py:func:`strpos`.
"""
return strpos(string, substring)
@@ -1058,12 +1084,12 @@ def left(string: Expr, n: Expr) -> Expr:
"""Returns the first ``n`` characters in the ``string``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["the cat"]})
- >>> left_df = df.select(dfn.functions.left(dfn.col("a"),
dfn.lit(3)).alias("left"))
- >>> left_df.collect_column("left")[0].as_py()
- 'the'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["the cat"]})
+ >>> left_df = df.select(
+ ... dfn.functions.left(dfn.col("a"), dfn.lit(3)).alias("left"))
+ >>> left_df.collect_column("left")[0].as_py()
+ 'the'
"""
return Expr(f.left(string.expr, n.expr))
@@ -1072,13 +1098,12 @@ def levenshtein(string1: Expr, string2: Expr) -> Expr:
"""Returns the Levenshtein distance between the two given strings.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["kitten"]})
- >>> result = df.select(
- ... dfn.functions.levenshtein(dfn.col("a"),
dfn.lit("sitting")).alias("d"))
- >>> result.collect_column("d")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["kitten"]})
+ >>> result = df.select(
+ ... dfn.functions.levenshtein(dfn.col("a"),
dfn.lit("sitting")).alias("d"))
+ >>> result.collect_column("d")[0].as_py()
+ 3
"""
return Expr(f.levenshtein(string1.expr, string2.expr))
@@ -1141,12 +1166,11 @@ def lower(arg: Expr) -> Expr:
"""Converts a string to lowercase.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["THE CaT"]})
- >>> lower_df = df.select(dfn.functions.lower(dfn.col("a")).alias("lower"))
- >>> lower_df.collect_column("lower")[0].as_py()
- 'the cat'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["THE CaT"]})
+ >>> lower_df = df.select(dfn.functions.lower(dfn.col("a")).alias("lower"))
+ >>> lower_df.collect_column("lower")[0].as_py()
+ 'the cat'
"""
return Expr(f.lower(arg.expr))
@@ -1159,14 +1183,23 @@ def lpad(string: Expr, count: Expr, characters: Expr |
None = None) -> Expr:
truncated (on the right).
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["the cat", "a hat"]})
- >>> lpad_df = df.select(dfn.functions.lpad(dfn.col("a"),
dfn.lit(6)).alias("lpad"))
- >>> lpad_df.collect_column("lpad")[0].as_py()
- 'the ca'
- >>> lpad_df.collect_column("lpad")[1].as_py()
- ' a hat'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["the cat", "a hat"]})
+ >>> lpad_df = df.select(
+ ... dfn.functions.lpad(
+ ... dfn.col("a"), dfn.lit(6)
+ ... ).alias("lpad"))
+ >>> lpad_df.collect_column("lpad")[0].as_py()
+ 'the ca'
+ >>> lpad_df.collect_column("lpad")[1].as_py()
+ ' a hat'
+
+ >>> result = df.select(
+ ... dfn.functions.lpad(
+ ... dfn.col("a"), dfn.lit(10), characters=dfn.lit(".")
+ ... ).alias("lpad"))
+ >>> result.collect_column("lpad")[0].as_py()
+ '...the cat'
"""
characters = characters if characters is not None else Expr.literal(" ")
return Expr(f.lpad(string.expr, count.expr, characters.expr))
@@ -1176,12 +1209,11 @@ def ltrim(arg: Expr) -> Expr:
"""Removes all characters, spaces by default, from the beginning of a
string.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [" a "]})
- >>> trim_df = df.select(dfn.functions.ltrim(dfn.col("a")).alias("trimmed"))
- >>> trim_df.collect_column("trimmed")[0].as_py()
- 'a '
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [" a "]})
+ >>> trim_df = df.select(dfn.functions.ltrim(dfn.col("a")).alias("trimmed"))
+ >>> trim_df.collect_column("trimmed")[0].as_py()
+ 'a '
"""
return Expr(f.ltrim(arg.expr))
@@ -1236,12 +1268,11 @@ def octet_length(arg: Expr) -> Expr:
"""Returns the number of bytes of a string.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result =
df.select(dfn.functions.octet_length(dfn.col("a")).alias("len"))
- >>> result.collect_column("len")[0].as_py()
- 5
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result =
df.select(dfn.functions.octet_length(dfn.col("a")).alias("len"))
+ >>> result.collect_column("len")[0].as_py()
+ 5
"""
return Expr(f.octet_length(arg.expr))
@@ -1255,14 +1286,13 @@ def overlay(
extends for ``length`` characters with new substring.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["abcdef"]})
- >>> result = df.select(
- ... dfn.functions.overlay(dfn.col("a"), dfn.lit("XY"), dfn.lit(3),
- ... dfn.lit(2)).alias("o"))
- >>> result.collect_column("o")[0].as_py()
- 'abXYef'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["abcdef"]})
+ >>> result = df.select(
+ ... dfn.functions.overlay(dfn.col("a"), dfn.lit("XY"), dfn.lit(3),
+ ... dfn.lit(2)).alias("o"))
+ >>> result.collect_column("o")[0].as_py()
+ 'abXYef'
"""
if length is None:
return Expr(f.overlay(string.expr, substring.expr, start.expr))
@@ -1275,14 +1305,12 @@ def pi() -> Expr:
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1]})
- >>> import builtins
+ >>> from math import pi
>>> result = df.select(
... dfn.functions.pi().alias("pi")
... )
- >>> builtins.round(
- ... result.collect_column("pi")[0].as_py(), 5
- ... )
- 3.14159
+ >>> result.collect_column("pi")[0].as_py() == pi
+ True
"""
return Expr(f.pi())
@@ -1290,7 +1318,8 @@ def pi() -> Expr:
def position(string: Expr, substring: Expr) -> Expr:
"""Finds the position from where the ``substring`` matches the ``string``.
- This is an alias for :py:func:`strpos`.
+ See Also:
+ This is an alias for :py:func:`strpos`.
"""
return strpos(string, substring)
@@ -1313,7 +1342,8 @@ def power(base: Expr, exponent: Expr) -> Expr:
def pow(base: Expr, exponent: Expr) -> Expr:
"""Returns ``base`` raised to the power of ``exponent``.
- This is an alias of :py:func:`power`.
+ See Also:
+ This is an alias of :py:func:`power`.
"""
return power(base, exponent)
@@ -1350,6 +1380,17 @@ def regexp_like(string: Expr, regex: Expr, flags: Expr |
None = None) -> Expr:
... )
>>> result.collect_column("m")[0].as_py()
True
+
+ Use ``flags`` for case-insensitive matching:
+
+ >>> result = df.select(
+ ... dfn.functions.regexp_like(
+ ... dfn.col("a"), dfn.lit("HELLO"),
+ ... flags=dfn.lit("i"),
+ ... ).alias("m")
+ ... )
+ >>> result.collect_column("m")[0].as_py()
+ True
"""
if flags is not None:
flags = flags.expr
@@ -1372,6 +1413,17 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr
| None = None) -> Expr:
... )
>>> result.collect_column("m")[0].as_py()
['42']
+
+ Use ``flags`` for case-insensitive matching:
+
+ >>> result = df.select(
+ ... dfn.functions.regexp_match(
+ ... dfn.col("a"), dfn.lit("(HELLO)"),
+ ... flags=dfn.lit("i"),
+ ... ).alias("m")
+ ... )
+ >>> result.collect_column("m")[0].as_py()
+ ['hello']
"""
if flags is not None:
flags = flags.expr
@@ -1400,6 +1452,18 @@ def regexp_replace(
... )
>>> result.collect_column("r")[0].as_py()
'hello XX'
+
+ Use the ``g`` flag to replace all occurrences:
+
+ >>> df = ctx.from_pydict({"a": ["a1 b2 c3"]})
+ >>> result = df.select(
+ ... dfn.functions.regexp_replace(
+ ... dfn.col("a"), dfn.lit("\\d+"),
+ ... dfn.lit("X"), flags=dfn.lit("g"),
+ ... ).alias("r")
+ ... )
+ >>> result.collect_column("r")[0].as_py()
+ 'aX bX cX'
"""
if flags is not None:
flags = flags.expr
@@ -1418,9 +1482,22 @@ def regexp_count(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": ["abcabc"]})
>>> result = df.select(
- ... dfn.functions.regexp_count(dfn.col("a"),
dfn.lit("abc")).alias("c"))
+ ... dfn.functions.regexp_count(
+ ... dfn.col("a"), dfn.lit("abc")
+ ... ).alias("c"))
>>> result.collect_column("c")[0].as_py()
2
+
+ Use ``start`` to begin searching from a position, and
+ ``flags`` for case-insensitive matching:
+
+ >>> result = df.select(
+ ... dfn.functions.regexp_count(
+ ... dfn.col("a"), dfn.lit("ABC"),
+ ... start=dfn.lit(4), flags=dfn.lit("i"),
+ ... ).alias("c"))
+ >>> result.collect_column("c")[0].as_py()
+ 1
"""
if flags is not None:
flags = flags.expr
@@ -1456,6 +1533,31 @@ def regexp_instr(
... )
>>> result.collect_column("pos")[0].as_py()
7
+
+ Use ``start`` to search from a position, ``n`` for the
+ nth occurrence, and ``flags`` for case-insensitive mode:
+
+ >>> df = ctx.from_pydict({"a": ["abc ABC abc"]})
+ >>> result = df.select(
+ ... dfn.functions.regexp_instr(
+ ... dfn.col("a"), dfn.lit("abc"),
+ ... start=dfn.lit(2), n=dfn.lit(1),
+ ... flags=dfn.lit("i"),
+ ... ).alias("pos")
+ ... )
+ >>> result.collect_column("pos")[0].as_py()
+ 5
+
+ Use ``sub_expr`` to get the position of a capture group:
+
+ >>> result = df.select(
+ ... dfn.functions.regexp_instr(
+ ... dfn.col("a"), dfn.lit("(abc)"),
+ ... sub_expr=dfn.lit(1),
+ ... ).alias("pos")
+ ... )
+ >>> result.collect_column("pos")[0].as_py()
+ 1
"""
start = start.expr if start is not None else None
n = n.expr if n is not None else None
@@ -1478,12 +1580,12 @@ def repeat(string: Expr, n: Expr) -> Expr:
"""Repeats the ``string`` to ``n`` times.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["ha"]})
- >>> result = df.select(dfn.functions.repeat(dfn.col("a"),
dfn.lit(3)).alias("r"))
- >>> result.collect_column("r")[0].as_py()
- 'hahaha'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["ha"]})
+ >>> result = df.select(
+ ... dfn.functions.repeat(dfn.col("a"), dfn.lit(3)).alias("r"))
+ >>> result.collect_column("r")[0].as_py()
+ 'hahaha'
"""
return Expr(f.repeat(string.expr, n.expr))
@@ -1492,14 +1594,13 @@ def replace(string: Expr, from_val: Expr, to_val: Expr)
-> Expr:
"""Replaces all occurrences of ``from_val`` with ``to_val`` in the
``string``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello world"]})
- >>> result = df.select(
- ... dfn.functions.replace(dfn.col("a"), dfn.lit("world"),
- ... dfn.lit("there")).alias("r"))
- >>> result.collect_column("r")[0].as_py()
- 'hello there'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello world"]})
+ >>> result = df.select(
+ ... dfn.functions.replace(dfn.col("a"), dfn.lit("world"),
+ ... dfn.lit("there")).alias("r"))
+ >>> result.collect_column("r")[0].as_py()
+ 'hello there'
"""
return Expr(f.replace(string.expr, from_val.expr, to_val.expr))
@@ -1508,12 +1609,11 @@ def reverse(arg: Expr) -> Expr:
"""Reverse the string argument.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result = df.select(dfn.functions.reverse(dfn.col("a")).alias("r"))
- >>> result.collect_column("r")[0].as_py()
- 'olleh'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result = df.select(dfn.functions.reverse(dfn.col("a")).alias("r"))
+ >>> result.collect_column("r")[0].as_py()
+ 'olleh'
"""
return Expr(f.reverse(arg.expr))
@@ -1522,12 +1622,11 @@ def right(string: Expr, n: Expr) -> Expr:
"""Returns the last ``n`` characters in the ``string``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result = df.select(dfn.functions.right(dfn.col("a"),
dfn.lit(3)).alias("r"))
- >>> result.collect_column("r")[0].as_py()
- 'llo'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result = df.select(dfn.functions.right(dfn.col("a"),
dfn.lit(3)).alias("r"))
+ >>> result.collect_column("r")[0].as_py()
+ 'llo'
"""
return Expr(f.right(string.expr, n.expr))
@@ -1558,13 +1657,12 @@ def rpad(string: Expr, count: Expr, characters: Expr |
None = None) -> Expr:
by default). If the string is already longer than length then it is
truncated.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hi"]})
- >>> result = df.select(
- ... dfn.functions.rpad(dfn.col("a"), dfn.lit(5),
dfn.lit("!")).alias("r"))
- >>> result.collect_column("r")[0].as_py()
- 'hi!!!'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hi"]})
+ >>> result = df.select(
+ ... dfn.functions.rpad(dfn.col("a"), dfn.lit(5),
dfn.lit("!")).alias("r"))
+ >>> result.collect_column("r")[0].as_py()
+ 'hi!!!'
"""
characters = characters if characters is not None else Expr.literal(" ")
return Expr(f.rpad(string.expr, count.expr, characters.expr))
@@ -1574,12 +1672,11 @@ def rtrim(arg: Expr) -> Expr:
"""Removes all characters, spaces by default, from the end of a string.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [" a "]})
- >>> trim_df = df.select(dfn.functions.rtrim(dfn.col("a")).alias("trimmed"))
- >>> trim_df.collect_column("trimmed")[0].as_py()
- ' a'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [" a "]})
+ >>> trim_df = df.select(dfn.functions.rtrim(dfn.col("a")).alias("trimmed"))
+ >>> trim_df.collect_column("trimmed")[0].as_py()
+ ' a'
"""
return Expr(f.rtrim(arg.expr))
@@ -1690,13 +1787,14 @@ def split_part(string: Expr, delimiter: Expr, index:
Expr) -> Expr:
on the index.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["a,b,c"]})
- >>> result = df.select(
- ... dfn.functions.split_part(dfn.col("a"), dfn.lit(","),
dfn.lit(2)).alias("s"))
- >>> result.collect_column("s")[0].as_py()
- 'b'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["a,b,c"]})
+ >>> result = df.select(
+ ... dfn.functions.split_part(
+ ... dfn.col("a"), dfn.lit(","), dfn.lit(2)
+ ... ).alias("s"))
+ >>> result.collect_column("s")[0].as_py()
+ 'b'
"""
return Expr(f.split_part(string.expr, delimiter.expr, index.expr))
@@ -1718,13 +1816,12 @@ def starts_with(string: Expr, prefix: Expr) -> Expr:
"""Returns true if string starts with prefix.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello_from_datafusion"]})
- >>> result = df.select(
- ... dfn.functions.starts_with(dfn.col("a"),
dfn.lit("hello")).alias("sw"))
- >>> result.collect_column("sw")[0].as_py()
- True
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello_from_datafusion"]})
+ >>> result = df.select(
+ ... dfn.functions.starts_with(dfn.col("a"),
dfn.lit("hello")).alias("sw"))
+ >>> result.collect_column("sw")[0].as_py()
+ True
"""
return Expr(f.starts_with(string.expr, prefix.expr))
@@ -1733,13 +1830,12 @@ def strpos(string: Expr, substring: Expr) -> Expr:
"""Finds the position from where the ``substring`` matches the ``string``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result = df.select(
- ... dfn.functions.strpos(dfn.col("a"), dfn.lit("llo")).alias("pos"))
- >>> result.collect_column("pos")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result = df.select(
+ ... dfn.functions.strpos(dfn.col("a"),
dfn.lit("llo")).alias("pos"))
+ >>> result.collect_column("pos")[0].as_py()
+ 3
"""
return Expr(f.strpos(string.expr, substring.expr))
@@ -1748,12 +1844,12 @@ def substr(string: Expr, position: Expr) -> Expr:
"""Substring from the ``position`` to the end.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result = df.select(dfn.functions.substr(dfn.col("a"),
dfn.lit(3)).alias("s"))
- >>> result.collect_column("s")[0].as_py()
- 'llo'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result = df.select(
+ ... dfn.functions.substr(dfn.col("a"), dfn.lit(3)).alias("s"))
+ >>> result.collect_column("s")[0].as_py()
+ 'llo'
"""
return Expr(f.substr(string.expr, position.expr))
@@ -1765,14 +1861,13 @@ def substr_index(string: Expr, delimiter: Expr, count:
Expr) -> Expr:
``delimiter``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["a.b.c"]})
- >>> result = df.select(
- ... dfn.functions.substr_index(dfn.col("a"), dfn.lit("."),
- ... dfn.lit(2)).alias("s"))
- >>> result.collect_column("s")[0].as_py()
- 'a.b'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["a.b.c"]})
+ >>> result = df.select(
+ ... dfn.functions.substr_index(dfn.col("a"), dfn.lit("."),
+ ... dfn.lit(2)).alias("s"))
+ >>> result.collect_column("s")[0].as_py()
+ 'a.b'
"""
return Expr(f.substr_index(string.expr, delimiter.expr, count.expr))
@@ -1781,13 +1876,14 @@ def substring(string: Expr, position: Expr, length:
Expr) -> Expr:
"""Substring from the ``position`` with ``length`` characters.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello world"]})
- >>> result = df.select(
- ... dfn.functions.substring(dfn.col("a"), dfn.lit(1),
dfn.lit(5)).alias("s"))
- >>> result.collect_column("s")[0].as_py()
- 'hello'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello world"]})
+ >>> result = df.select(
+ ... dfn.functions.substring(
+ ... dfn.col("a"), dfn.lit(1), dfn.lit(5)
+ ... ).alias("s"))
+ >>> result.collect_column("s")[0].as_py()
+ 'hello'
"""
return Expr(f.substring(string.expr, position.expr, length.expr))
@@ -1822,12 +1918,11 @@ def to_hex(arg: Expr) -> Expr:
"""Converts an integer to a hexadecimal string.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [255]})
- >>> result = df.select(dfn.functions.to_hex(dfn.col("a")).alias("hex"))
- >>> result.collect_column("hex")[0].as_py()
- 'ff'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [255]})
+ >>> result = df.select(dfn.functions.to_hex(dfn.col("a")).alias("hex"))
+ >>> result.collect_column("hex")[0].as_py()
+ 'ff'
"""
return Expr(f.to_hex(arg.expr))
@@ -1859,6 +1954,18 @@ def to_char(arg: Expr, formatter: Expr) -> Expr:
For usage of ``formatter`` see the rust chrono package ``strftime``
package.
[Documentation
here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+
+ Examples:
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]})
+ >>> result = df.select(
+ ... dfn.functions.to_char(
+ ... dfn.functions.to_timestamp(dfn.col("a")),
+ ... dfn.lit("%Y/%m/%d"),
+ ... ).alias("formatted")
+ ... )
+ >>> result.collect_column("formatted")[0].as_py()
+ '2021/01/01'
"""
return Expr(f.to_char(arg.expr, formatter.expr))
@@ -1878,6 +1985,14 @@ def to_date(arg: Expr, *formatters: Expr) -> Expr:
For usage of ``formatters`` see the rust chrono package ``strftime``
package.
[Documentation
here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+
+ Examples:
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["2021-07-20"]})
+ >>> result = df.select(
+ ... dfn.functions.to_date(dfn.col("a")).alias("dt"))
+ >>> str(result.collect_column("dt")[0].as_py())
+ '2021-07-20'
"""
return Expr(f.to_date(arg.expr, *_unwrap_exprs(formatters)))
@@ -1899,6 +2014,14 @@ def to_time(arg: Expr, *formatters: Expr) -> Expr:
For usage of ``formatters`` see the rust chrono package ``strftime``
package.
[Documentation
here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+
+ Examples:
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["14:30:00"]})
+ >>> result = df.select(
+ ... dfn.functions.to_time(dfn.col("a")).alias("t"))
+ >>> str(result.collect_column("t")[0].as_py())
+ '14:30:00'
"""
return Expr(f.to_time(arg.expr, *_unwrap_exprs(formatters)))
@@ -2053,7 +2176,8 @@ def current_time() -> Expr:
def datepart(part: Expr, date: Expr) -> Expr:
"""Return a specified part of a date.
- This is an alias for :py:func:`date_part`.
+ See Also:
+ This is an alias for :py:func:`date_part`.
"""
return date_part(part, date)
@@ -2076,7 +2200,8 @@ def date_part(part: Expr, date: Expr) -> Expr:
def extract(part: Expr, date: Expr) -> Expr:
"""Extracts a subfield from the date.
- This is an alias for :py:func:`date_part`.
+ See Also:
+ This is an alias for :py:func:`date_part`.
"""
return date_part(part, date)
@@ -2102,7 +2227,8 @@ def date_trunc(part: Expr, date: Expr) -> Expr:
def datetrunc(part: Expr, date: Expr) -> Expr:
"""Truncates the date to a specified level of precision.
- This is an alias for :py:func:`date_trunc`.
+ See Also:
+ This is an alias for :py:func:`date_trunc`.
"""
return date_trunc(part, date)
@@ -2148,14 +2274,13 @@ def translate(string: Expr, from_val: Expr, to_val:
Expr) -> Expr:
"""Replaces the characters in ``from_val`` with the counterpart in
``to_val``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result = df.select(
- ... dfn.functions.translate(dfn.col("a"), dfn.lit("helo"),
- ... dfn.lit("HELO")).alias("t"))
- >>> result.collect_column("t")[0].as_py()
- 'HELLO'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result = df.select(
+ ... dfn.functions.translate(dfn.col("a"), dfn.lit("helo"),
+ ... dfn.lit("HELO")).alias("t"))
+ >>> result.collect_column("t")[0].as_py()
+ 'HELLO'
"""
return Expr(f.translate(string.expr, from_val.expr, to_val.expr))
@@ -2164,12 +2289,11 @@ def trim(arg: Expr) -> Expr:
"""Removes all characters, spaces by default, from both sides of a string.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [" hello "]})
- >>> result = df.select(dfn.functions.trim(dfn.col("a")).alias("t"))
- >>> result.collect_column("t")[0].as_py()
- 'hello'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [" hello "]})
+ >>> result = df.select(dfn.functions.trim(dfn.col("a")).alias("t"))
+ >>> result.collect_column("t")[0].as_py()
+ 'hello'
"""
return Expr(f.trim(arg.expr))
@@ -2180,9 +2304,19 @@ def trunc(num: Expr, precision: Expr | None = None) -> Expr:
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.567]})
- >>> result = df.select(dfn.functions.trunc(dfn.col("a")).alias("t"))
+ >>> result = df.select(
+ ... dfn.functions.trunc(
+ ... dfn.col("a")
+ ... ).alias("t"))
>>> result.collect_column("t")[0].as_py()
1.0
+
+ >>> result = df.select(
+ ... dfn.functions.trunc(
+ ... dfn.col("a"), precision=dfn.lit(2)
+ ... ).alias("t"))
+ >>> result.collect_column("t")[0].as_py()
+ 1.56
"""
if precision is not None:
return Expr(f.trunc(num.expr, precision.expr))
@@ -2193,12 +2327,11 @@ def upper(arg: Expr) -> Expr:
"""Converts a string to uppercase.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["hello"]})
- >>> result = df.select(dfn.functions.upper(dfn.col("a")).alias("u"))
- >>> result.collect_column("u")[0].as_py()
- 'HELLO'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["hello"]})
+ >>> result = df.select(dfn.functions.upper(dfn.col("a")).alias("u"))
+ >>> result.collect_column("u")[0].as_py()
+ 'HELLO'
"""
return Expr(f.upper(arg.expr))
@@ -2207,13 +2340,14 @@ def make_array(*args: Expr) -> Expr:
"""Returns an array using the specified input expressions.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1]})
- >>> result = df.select(
- ... dfn.functions.make_array(dfn.lit(1), dfn.lit(2), dfn.lit(3)).alias("arr"))
- >>> result.collect_column("arr")[0].as_py()
- [1, 2, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> result = df.select(
+ ... dfn.functions.make_array(
+ ... dfn.lit(1), dfn.lit(2), dfn.lit(3)
+ ... ).alias("arr"))
+ >>> result.collect_column("arr")[0].as_py()
+ [1, 2, 3]
"""
args = [arg.expr for arg in args]
return Expr(f.make_array(args))
@@ -2222,7 +2356,8 @@ def make_array(*args: Expr) -> Expr:
def make_list(*args: Expr) -> Expr:
"""Returns an array using the specified input expressions.
- This is an alias for :py:func:`make_array`.
+ See Also:
+ This is an alias for :py:func:`make_array`.
"""
return make_array(*args)
@@ -2230,7 +2365,8 @@ def make_list(*args: Expr) -> Expr:
def array(*args: Expr) -> Expr:
"""Returns an array using the specified input expressions.
- This is an alias for :py:func:`make_array`.
+ See Also:
+ This is an alias for :py:func:`make_array`.
"""
return make_array(*args)
@@ -2239,13 +2375,12 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr:
"""Create a list of values in the range between start and stop.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1]})
- >>> result = df.select(
- ... dfn.functions.range(dfn.lit(0), dfn.lit(5), dfn.lit(2)).alias("r"))
- >>> result.collect_column("r")[0].as_py()
- [0, 2, 4]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> result = df.select(
+ ... dfn.functions.range(dfn.lit(0), dfn.lit(5), dfn.lit(2)).alias("r"))
+ >>> result.collect_column("r")[0].as_py()
+ [0, 2, 4]
"""
return Expr(f.range(start.expr, stop.expr, step.expr))
@@ -2377,13 +2512,12 @@ def array_append(array: Expr, element: Expr) -> Expr:
"""Appends an element to the end of an array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
- >>> result = df.select(
- ... dfn.functions.array_append(dfn.col("a"), dfn.lit(4)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [1, 2, 3, 4]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+ >>> result = df.select(
+ ... dfn.functions.array_append(dfn.col("a"), dfn.lit(4)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1, 2, 3, 4]
"""
return Expr(f.array_append(array.expr, element.expr))
@@ -2391,7 +2525,8 @@ def array_append(array: Expr, element: Expr) -> Expr:
def array_push_back(array: Expr, element: Expr) -> Expr:
"""Appends an element to the end of an array.
- This is an alias for :py:func:`array_append`.
+ See Also:
+ This is an alias for :py:func:`array_append`.
"""
return array_append(array, element)
@@ -2399,7 +2534,8 @@ def array_push_back(array: Expr, element: Expr) -> Expr:
def list_append(array: Expr, element: Expr) -> Expr:
"""Appends an element to the end of an array.
- This is an alias for :py:func:`array_append`.
+ See Also:
+ This is an alias for :py:func:`array_append`.
"""
return array_append(array, element)
@@ -2407,7 +2543,8 @@ def list_append(array: Expr, element: Expr) -> Expr:
def list_push_back(array: Expr, element: Expr) -> Expr:
"""Appends an element to the end of an array.
- This is an alias for :py:func:`array_append`.
+ See Also:
+ This is an alias for :py:func:`array_append`.
"""
return array_append(array, element)
@@ -2416,13 +2553,12 @@ def array_concat(*args: Expr) -> Expr:
"""Concatenates the input arrays.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]})
- >>> result = df.select(
- ... dfn.functions.array_concat(dfn.col("a"), dfn.col("b")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [1, 2, 3, 4]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]})
+ >>> result = df.select(
+ ... dfn.functions.array_concat(dfn.col("a"), dfn.col("b")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1, 2, 3, 4]
"""
args = [arg.expr for arg in args]
return Expr(f.array_concat(args))
@@ -2431,7 +2567,8 @@ def array_concat(*args: Expr) -> Expr:
def array_cat(*args: Expr) -> Expr:
"""Concatenates the input arrays.
- This is an alias for :py:func:`array_concat`.
+ See Also:
+ This is an alias for :py:func:`array_concat`.
"""
return array_concat(*args)
@@ -2440,12 +2577,11 @@ def array_dims(array: Expr) -> Expr:
"""Returns an array of the array's dimensions.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
- >>> result = df.select(dfn.functions.array_dims(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+ >>> result = df.select(dfn.functions.array_dims(dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [3]
"""
return Expr(f.array_dims(array.expr))
@@ -2454,18 +2590,17 @@ def array_distinct(array: Expr) -> Expr:
"""Returns distinct values from the array after removing duplicates.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 1, 2, 3]]})
- >>> result = df.select(
- ... dfn.functions.array_distinct(
- ... dfn.col("a")
- ... ).alias("result")
- ... )
- >>> sorted(
- ... result.collect_column("result")[0].as_py()
- ... )
- [1, 2, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 1, 2, 3]]})
+ >>> result = df.select(
+ ... dfn.functions.array_distinct(
+ ... dfn.col("a")
+ ... ).alias("result")
+ ... )
+ >>> sorted(
+ ... result.collect_column("result")[0].as_py()
+ ... )
+ [1, 2, 3]
"""
return Expr(f.array_distinct(array.expr))
@@ -2473,7 +2608,8 @@ def array_distinct(array: Expr) -> Expr:
def list_cat(*args: Expr) -> Expr:
"""Concatenates the input arrays.
- This is an alias for :py:func:`array_concat`, :py:func:`array_cat`.
+ See Also:
+ This is an alias for :py:func:`array_concat`, :py:func:`array_cat`.
"""
return array_concat(*args)
@@ -2481,7 +2617,8 @@ def list_cat(*args: Expr) -> Expr:
def list_concat(*args: Expr) -> Expr:
"""Concatenates the input arrays.
- This is an alias for :py:func:`array_concat`, :py:func:`array_cat`.
+ See Also:
+ This is an alias for :py:func:`array_concat`, :py:func:`array_cat`.
"""
return array_concat(*args)
@@ -2489,7 +2626,8 @@ def list_concat(*args: Expr) -> Expr:
def list_distinct(array: Expr) -> Expr:
"""Returns distinct values from the array after removing duplicates.
- This is an alias for :py:func:`array_distinct`.
+ See Also:
+ This is an alias for :py:func:`array_distinct`.
"""
return array_distinct(array)
@@ -2497,7 +2635,8 @@ def list_distinct(array: Expr) -> Expr:
def list_dims(array: Expr) -> Expr:
"""Returns an array of the array's dimensions.
- This is an alias for :py:func:`array_dims`.
+ See Also:
+ This is an alias for :py:func:`array_dims`.
"""
return array_dims(array)
@@ -2506,13 +2645,12 @@ def array_element(array: Expr, n: Expr) -> Expr:
"""Extracts the element with the index n from the array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[10, 20, 30]]})
- >>> result = df.select(
- ... dfn.functions.array_element(dfn.col("a"), dfn.lit(2)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- 20
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[10, 20, 30]]})
+ >>> result = df.select(
+ ... dfn.functions.array_element(dfn.col("a"), dfn.lit(2)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ 20
"""
return Expr(f.array_element(array.expr, n.expr))
@@ -2521,12 +2659,11 @@ def array_empty(array: Expr) -> Expr:
"""Returns a boolean indicating whether the array is empty.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2]]})
- >>> result = df.select(dfn.functions.array_empty(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- False
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2]]})
+ >>> result = df.select(dfn.functions.array_empty(dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ False
"""
return Expr(f.array_empty(array.expr))
@@ -2534,7 +2671,8 @@ def array_empty(array: Expr) -> Expr:
def array_extract(array: Expr, n: Expr) -> Expr:
"""Extracts the element with the index n from the array.
- This is an alias for :py:func:`array_element`.
+ See Also:
+ This is an alias for :py:func:`array_element`.
"""
return array_element(array, n)
@@ -2542,7 +2680,8 @@ def array_extract(array: Expr, n: Expr) -> Expr:
def list_element(array: Expr, n: Expr) -> Expr:
"""Extracts the element with the index n from the array.
- This is an alias for :py:func:`array_element`.
+ See Also:
+ This is an alias for :py:func:`array_element`.
"""
return array_element(array, n)
@@ -2550,7 +2689,8 @@ def list_element(array: Expr, n: Expr) -> Expr:
def list_extract(array: Expr, n: Expr) -> Expr:
"""Extracts the element with the index n from the array.
- This is an alias for :py:func:`array_element`.
+ See Also:
+ This is an alias for :py:func:`array_element`.
"""
return array_element(array, n)
@@ -2559,12 +2699,11 @@ def array_length(array: Expr) -> Expr:
"""Returns the length of the array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
- >>> result = df.select(dfn.functions.array_length(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+ >>> result = df.select(dfn.functions.array_length(dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ 3
"""
return Expr(f.array_length(array.expr))
@@ -2572,7 +2711,8 @@ def array_length(array: Expr) -> Expr:
def list_length(array: Expr) -> Expr:
"""Returns the length of the array.
- This is an alias for :py:func:`array_length`.
+ See Also:
+ This is an alias for :py:func:`array_length`.
"""
return array_length(array)
@@ -2581,13 +2721,12 @@ def array_has(first_array: Expr, second_array: Expr) -> Expr:
"""Returns true if the element appears in the first array, otherwise false.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
- >>> result = df.select(
- ... dfn.functions.array_has(dfn.col("a"), dfn.lit(2)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- True
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+ >>> result = df.select(
+ ... dfn.functions.array_has(dfn.col("a"), dfn.lit(2)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ True
"""
return Expr(f.array_has(first_array.expr, second_array.expr))
@@ -2599,13 +2738,12 @@ def array_has_all(first_array: Expr, second_array: Expr) -> Expr:
Otherwise, it returns false.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[1, 2]]})
- >>> result = df.select(
- ... dfn.functions.array_has_all(dfn.col("a"), dfn.col("b")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- True
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[1, 2]]})
+ >>> result = df.select(
+ ... dfn.functions.array_has_all(dfn.col("a"), dfn.col("b")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ True
"""
return Expr(f.array_has_all(first_array.expr, second_array.expr))
@@ -2617,13 +2755,12 @@ def array_has_any(first_array: Expr, second_array: Expr) -> Expr:
array. Otherwise, it returns false.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 5]]})
- >>> result = df.select(
- ... dfn.functions.array_has_any(dfn.col("a"), dfn.col("b")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- True
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 5]]})
+ >>> result = df.select(
+ ... dfn.functions.array_has_any(dfn.col("a"), dfn.col("b")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ True
"""
return Expr(f.array_has_any(first_array.expr, second_array.expr))
@@ -2632,13 +2769,24 @@ def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr:
"""Return the position of the first occurrence of ``element`` in ``array``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[10, 20, 30]]})
- >>> result = df.select(
- ... dfn.functions.array_position(dfn.col("a"), dfn.lit(20)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- 2
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[10, 20, 30]]})
+ >>> result = df.select(
+ ... dfn.functions.array_position(
+ ... dfn.col("a"), dfn.lit(20)
+ ... ).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ 2
+
+ Use ``index`` to start searching from a given position:
+
+ >>> df = ctx.from_pydict({"a": [[10, 20, 10, 20]]})
+ >>> result = df.select(
+ ... dfn.functions.array_position(
+ ... dfn.col("a"), dfn.lit(20), index=3,
+ ... ).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ 4
"""
return Expr(f.array_position(array.expr, element.expr, index))
@@ -2646,7 +2794,8 @@ def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr:
def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr:
"""Return the position of the first occurrence of ``element`` in ``array``.
- This is an alias for :py:func:`array_position`.
+ See Also:
+ This is an alias for :py:func:`array_position`.
"""
return array_position(array, element, index)
@@ -2654,7 +2803,8 @@ def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr:
def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr:
"""Return the position of the first occurrence of ``element`` in ``array``.
- This is an alias for :py:func:`array_position`.
+ See Also:
+ This is an alias for :py:func:`array_position`.
"""
return array_position(array, element, index)
@@ -2662,7 +2812,8 @@ def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr:
def list_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr:
"""Return the position of the first occurrence of ``element`` in ``array``.
- This is an alias for :py:func:`array_position`.
+ See Also:
+ This is an alias for :py:func:`array_position`.
"""
return array_position(array, element, index)
@@ -2671,13 +2822,12 @@ def array_positions(array: Expr, element: Expr) -> Expr:
"""Searches for an element in the array and returns all occurrences.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
- >>> result = df.select(
- ... dfn.functions.array_positions(dfn.col("a"), dfn.lit(1)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [1, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
+ >>> result = df.select(
+ ... dfn.functions.array_positions(dfn.col("a"), dfn.lit(1)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1, 3]
"""
return Expr(f.array_positions(array.expr, element.expr))
@@ -2685,7 +2835,8 @@ def array_positions(array: Expr, element: Expr) -> Expr:
def list_positions(array: Expr, element: Expr) -> Expr:
"""Searches for an element in the array and returns all occurrences.
- This is an alias for :py:func:`array_positions`.
+ See Also:
+ This is an alias for :py:func:`array_positions`.
"""
return array_positions(array, element)
@@ -2694,12 +2845,11 @@ def array_ndims(array: Expr) -> Expr:
"""Returns the number of dimensions of the array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
- >>> result = df.select(dfn.functions.array_ndims(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- 1
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+ >>> result = df.select(dfn.functions.array_ndims(dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ 1
"""
return Expr(f.array_ndims(array.expr))
@@ -2707,7 +2857,8 @@ def array_ndims(array: Expr) -> Expr:
def list_ndims(array: Expr) -> Expr:
"""Returns the number of dimensions of the array.
- This is an alias for :py:func:`array_ndims`.
+ See Also:
+ This is an alias for :py:func:`array_ndims`.
"""
return array_ndims(array)
@@ -2716,13 +2867,12 @@ def array_prepend(element: Expr, array: Expr) -> Expr:
"""Prepends an element to the beginning of an array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2]]})
- >>> result = df.select(
- ... dfn.functions.array_prepend(dfn.lit(0), dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [0, 1, 2]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2]]})
+ >>> result = df.select(
+ ... dfn.functions.array_prepend(dfn.lit(0), dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [0, 1, 2]
"""
return Expr(f.array_prepend(element.expr, array.expr))
@@ -2730,7 +2880,8 @@ def array_prepend(element: Expr, array: Expr) -> Expr:
def array_push_front(element: Expr, array: Expr) -> Expr:
"""Prepends an element to the beginning of an array.
- This is an alias for :py:func:`array_prepend`.
+ See Also:
+ This is an alias for :py:func:`array_prepend`.
"""
return array_prepend(element, array)
@@ -2738,7 +2889,8 @@ def array_push_front(element: Expr, array: Expr) -> Expr:
def list_prepend(element: Expr, array: Expr) -> Expr:
"""Prepends an element to the beginning of an array.
- This is an alias for :py:func:`array_prepend`.
+ See Also:
+ This is an alias for :py:func:`array_prepend`.
"""
return array_prepend(element, array)
@@ -2746,7 +2898,8 @@ def list_prepend(element: Expr, array: Expr) -> Expr:
def list_push_front(element: Expr, array: Expr) -> Expr:
"""Prepends an element to the beginning of an array.
- This is an alias for :py:func:`array_prepend`.
+ See Also:
+ This is an alias for :py:func:`array_prepend`.
"""
return array_prepend(element, array)
@@ -2755,12 +2908,12 @@ def array_pop_back(array: Expr) -> Expr:
"""Returns the array without the last element.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
- >>> result = df.select(dfn.functions.array_pop_back(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [1, 2]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+ >>> result = df.select(
+ ... dfn.functions.array_pop_back(dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1, 2]
"""
return Expr(f.array_pop_back(array.expr))
@@ -2769,12 +2922,12 @@ def array_pop_front(array: Expr) -> Expr:
"""Returns the array without the first element.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
- >>> result = df.select(dfn.functions.array_pop_front(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [2, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+ >>> result = df.select(
+ ... dfn.functions.array_pop_front(dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [2, 3]
"""
return Expr(f.array_pop_front(array.expr))
@@ -2783,13 +2936,12 @@ def array_remove(array: Expr, element: Expr) -> Expr:
"""Removes the first element from the array equal to the given value.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
- >>> result = df.select(
- ... dfn.functions.array_remove(dfn.col("a"), dfn.lit(1)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [2, 1]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
+ >>> result = df.select(
+ ... dfn.functions.array_remove(dfn.col("a"), dfn.lit(1)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [2, 1]
"""
return Expr(f.array_remove(array.expr, element.expr))
@@ -2797,7 +2949,8 @@ def array_remove(array: Expr, element: Expr) -> Expr:
def list_remove(array: Expr, element: Expr) -> Expr:
"""Removes the first element from the array equal to the given value.
- This is an alias for :py:func:`array_remove`.
+ See Also:
+ This is an alias for :py:func:`array_remove`.
"""
return array_remove(array, element)
@@ -2806,14 +2959,13 @@ def array_remove_n(array: Expr, element: Expr, max: Expr) -> Expr:
"""Removes the first ``max`` elements from the array equal to the given value.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]})
- >>> result = df.select(
- ... dfn.functions.array_remove_n(dfn.col("a"), dfn.lit(1),
- ... dfn.lit(2)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [2, 1]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]})
+ >>> result = df.select(
+ ... dfn.functions.array_remove_n(dfn.col("a"), dfn.lit(1),
+ ... dfn.lit(2)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [2, 1]
"""
return Expr(f.array_remove_n(array.expr, element.expr, max.expr))
@@ -2821,7 +2973,8 @@ def array_remove_n(array: Expr, element: Expr, max: Expr) -> Expr:
def list_remove_n(array: Expr, element: Expr, max: Expr) -> Expr:
"""Removes the first ``max`` elements from the array equal to the given value.
- This is an alias for :py:func:`array_remove_n`.
+ See Also:
+ This is an alias for :py:func:`array_remove_n`.
"""
return array_remove_n(array, element, max)
@@ -2830,13 +2983,14 @@ def array_remove_all(array: Expr, element: Expr) -> Expr:
"""Removes all elements from the array equal to the given value.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
- >>> result = df.select(
- ... dfn.functions.array_remove_all(dfn.col("a"), dfn.lit(1)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [2]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
+ >>> result = df.select(
+ ... dfn.functions.array_remove_all(
+ ... dfn.col("a"), dfn.lit(1)
+ ... ).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [2]
"""
return Expr(f.array_remove_all(array.expr, element.expr))
@@ -2844,7 +2998,8 @@ def array_remove_all(array: Expr, element: Expr) -> Expr:
def list_remove_all(array: Expr, element: Expr) -> Expr:
"""Removes all elements from the array equal to the given value.
- This is an alias for :py:func:`array_remove_all`.
+ See Also:
+ This is an alias for :py:func:`array_remove_all`.
"""
return array_remove_all(array, element)
@@ -2853,13 +3008,12 @@ def array_repeat(element: Expr, count: Expr) -> Expr:
"""Returns an array containing ``element`` ``count`` times.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1]})
- >>> result = df.select(
- ... dfn.functions.array_repeat(dfn.lit(3), dfn.lit(3)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [3, 3, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> result = df.select(
+ ... dfn.functions.array_repeat(dfn.lit(3), dfn.lit(3)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [3, 3, 3]
"""
return Expr(f.array_repeat(element.expr, count.expr))
@@ -2867,7 +3021,8 @@ def array_repeat(element: Expr, count: Expr) -> Expr:
def list_repeat(element: Expr, count: Expr) -> Expr:
"""Returns an array containing ``element`` ``count`` times.
- This is an alias for :py:func:`array_repeat`.
+ See Also:
+ This is an alias for :py:func:`array_repeat`.
"""
return array_repeat(element, count)
@@ -2876,14 +3031,13 @@ def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr:
"""Replaces the first occurrence of ``from_val`` with ``to_val``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
- >>> result = df.select(
- ... dfn.functions.array_replace(dfn.col("a"), dfn.lit(1),
- ... dfn.lit(9)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [9, 2, 1]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
+ >>> result = df.select(
+ ... dfn.functions.array_replace(dfn.col("a"), dfn.lit(1),
+ ... dfn.lit(9)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [9, 2, 1]
"""
return Expr(f.array_replace(array.expr, from_val.expr, to_val.expr))
@@ -2891,7 +3045,8 @@ def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr:
def list_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr:
"""Replaces the first occurrence of ``from_val`` with ``to_val``.
- This is an alias for :py:func:`array_replace`.
+ See Also:
+ This is an alias for :py:func:`array_replace`.
"""
return array_replace(array, from_val, to_val)
@@ -2903,14 +3058,13 @@ def array_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Exp
specified element.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]})
- >>> result = df.select(
- ... dfn.functions.array_replace_n(dfn.col("a"), dfn.lit(1), dfn.lit(9),
- ... dfn.lit(2)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [9, 2, 9, 1]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]})
+ >>> result = df.select(
+ ... dfn.functions.array_replace_n(dfn.col("a"), dfn.lit(1), dfn.lit(9),
+ ... dfn.lit(2)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [9, 2, 9, 1]
"""
return Expr(f.array_replace_n(array.expr, from_val.expr, to_val.expr, max.expr))
@@ -2921,7 +3075,8 @@ def list_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr
Replaces the first ``max`` occurrences of the specified element with another
specified element.
- This is an alias for :py:func:`array_replace_n`.
+ See Also:
+ This is an alias for :py:func:`array_replace_n`.
"""
return array_replace_n(array, from_val, to_val, max)
@@ -2930,14 +3085,13 @@ def array_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr:
"""Replaces all occurrences of ``from_val`` with ``to_val``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
- >>> result = df.select(
- ... dfn.functions.array_replace_all(dfn.col("a"), dfn.lit(1),
- ... dfn.lit(9)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [9, 2, 9]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 1]]})
+ >>> result = df.select(
+ ... dfn.functions.array_replace_all(dfn.col("a"), dfn.lit(1),
+ ... dfn.lit(9)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [9, 2, 9]
"""
return Expr(f.array_replace_all(array.expr, from_val.expr, to_val.expr))
@@ -2945,7 +3099,8 @@ def array_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr:
def list_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr:
"""Replaces all occurrences of ``from_val`` with ``to_val``.
- This is an alias for :py:func:`array_replace_all`.
+ See Also:
+ This is an alias for :py:func:`array_replace_all`.
"""
return array_replace_all(array, from_val, to_val)
@@ -2959,12 +3114,22 @@ def array_sort(array: Expr, descending: bool = False, null_first: bool = False)
null_first: If True, nulls will be returned at the beginning of the array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[3, 1, 2]]})
- >>> result = df.select(dfn.functions.array_sort(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [1, 2, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[3, 1, 2]]})
+ >>> result = df.select(
+ ... dfn.functions.array_sort(
+ ... dfn.col("a")
+ ... ).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1, 2, 3]
+
+ >>> df = ctx.from_pydict({"a": [[3, None, 1]]})
+ >>> result = df.select(
+ ... dfn.functions.array_sort(
+ ... dfn.col("a"), descending=True, null_first=True,
+ ... ).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [None, 3, 1]
"""
desc = "DESC" if descending else "ASC"
nulls_first = "NULLS FIRST" if null_first else "NULLS LAST"
@@ -2978,7 +3143,11 @@ def array_sort(array: Expr, descending: bool = False, null_first: bool = False)
def list_sort(array: Expr, descending: bool = False, null_first: bool = False) -> Expr:
- """This is an alias for :py:func:`array_sort`."""
+ """Sorts the array.
+
+ See Also:
+ This is an alias for :py:func:`array_sort`.
+ """
return array_sort(array, descending=descending, null_first=null_first)
@@ -2988,14 +3157,24 @@ def array_slice(
"""Returns a slice of the array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3, 4]]})
- >>> result = df.select(
- ... dfn.functions.array_slice(dfn.col("a"), dfn.lit(2),
- ... dfn.lit(3)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [2, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3, 4]]})
+ >>> result = df.select(
+ ... dfn.functions.array_slice(
+ ... dfn.col("a"), dfn.lit(2), dfn.lit(3)
+ ... ).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [2, 3]
+
+ Use ``stride`` to skip elements:
+
+ >>> result = df.select(
+ ... dfn.functions.array_slice(
+ ... dfn.col("a"), dfn.lit(1), dfn.lit(4),
+ ... stride=dfn.lit(2),
+ ... ).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1, 3]
"""
if stride is not None:
stride = stride.expr
@@ -3005,7 +3184,8 @@ def array_slice(
def list_slice(array: Expr, begin: Expr, end: Expr, stride: Expr | None = None) -> Expr:
"""Returns a slice of the array.
- This is an alias for :py:func:`array_slice`.
+ See Also:
+ This is an alias for :py:func:`array_slice`.
"""
return array_slice(array, begin, end, stride)
@@ -3014,18 +3194,17 @@ def array_intersect(array1: Expr, array2: Expr) -> Expr:
"""Returns the intersection of ``array1`` and ``array2``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]})
- >>> result = df.select(
- ... dfn.functions.array_intersect(
- ... dfn.col("a"), dfn.col("b")
- ... ).alias("result")
- ... )
- >>> sorted(
- ... result.collect_column("result")[0].as_py()
- ... )
- [2, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]})
+ >>> result = df.select(
+ ... dfn.functions.array_intersect(
+ ... dfn.col("a"), dfn.col("b")
+ ... ).alias("result")
+ ... )
+ >>> sorted(
+ ... result.collect_column("result")[0].as_py()
+ ... )
+ [2, 3]
"""
return Expr(f.array_intersect(array1.expr, array2.expr))
@@ -3033,7 +3212,8 @@ def array_intersect(array1: Expr, array2: Expr) -> Expr:
def list_intersect(array1: Expr, array2: Expr) -> Expr:
"""Returns the intersection of ``array1`` and ``array2``.
- This is an alias for :py:func:`array_intersect`.
+ See Also:
+ This is an alias for :py:func:`array_intersect`.
"""
return array_intersect(array1, array2)
@@ -3044,18 +3224,17 @@ def array_union(array1: Expr, array2: Expr) -> Expr:
Duplicate rows will not be returned.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]})
- >>> result = df.select(
- ... dfn.functions.array_union(
- ... dfn.col("a"), dfn.col("b")
- ... ).alias("result")
- ... )
- >>> sorted(
- ... result.collect_column("result")[0].as_py()
- ... )
- [1, 2, 3, 4]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]})
+ >>> result = df.select(
+ ... dfn.functions.array_union(
+ ... dfn.col("a"), dfn.col("b")
+ ... ).alias("result")
+ ... )
+ >>> sorted(
+ ... result.collect_column("result")[0].as_py()
+ ... )
+ [1, 2, 3, 4]
"""
return Expr(f.array_union(array1.expr, array2.expr))
@@ -3065,7 +3244,8 @@ def list_union(array1: Expr, array2: Expr) -> Expr:
Duplicate rows will not be returned.
- This is an alias for :py:func:`array_union`.
+ See Also:
+ This is an alias for :py:func:`array_union`.
"""
return array_union(array1, array2)
@@ -3074,13 +3254,12 @@ def array_except(array1: Expr, array2: Expr) -> Expr:
"""Returns the elements that appear in ``array1`` but not in ``array2``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]})
- >>> result = df.select(
- ... dfn.functions.array_except(dfn.col("a"),
dfn.col("b")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [1]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]})
+ >>> result = df.select(
+ ... dfn.functions.array_except(dfn.col("a"),
dfn.col("b")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1]
"""
return Expr(f.array_except(array1.expr, array2.expr))
@@ -3088,7 +3267,8 @@ def array_except(array1: Expr, array2: Expr) -> Expr:
def list_except(array1: Expr, array2: Expr) -> Expr:
"""Returns the elements that appear in ``array1`` but not in
``array2``.
- This is an alias for :py:func:`array_except`.
+ See Also:
+ This is an alias for :py:func:`array_except`.
"""
return array_except(array1, array2)
@@ -3100,14 +3280,13 @@ def array_resize(array: Expr, size: Expr, value: Expr)
-> Expr:
be filled with the given ``value``.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2]]})
- >>> result = df.select(
- ... dfn.functions.array_resize(dfn.col("a"), dfn.lit(4),
- ... dfn.lit(0)).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [1, 2, 0, 0]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2]]})
+ >>> result = df.select(
+ ... dfn.functions.array_resize(dfn.col("a"), dfn.lit(4),
+ ... dfn.lit(0)).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1, 2, 0, 0]
"""
return Expr(f.array_resize(array.expr, size.expr, value.expr))
@@ -3116,7 +3295,10 @@ def list_resize(array: Expr, size: Expr, value: Expr) ->
Expr:
"""Returns an array with the specified size filled.
If ``size`` is greater than the ``array`` length, the additional entries
will be
- filled with the given ``value``. This is an alias for
:py:func:`array_resize`.
+ filled with the given ``value``.
+
+ See Also:
+ This is an alias for :py:func:`array_resize`.
"""
return array_resize(array, size, value)
@@ -3125,12 +3307,11 @@ def flatten(array: Expr) -> Expr:
"""Flattens an array of arrays into a single array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[[1, 2], [3, 4]]]})
- >>> result = df.select(dfn.functions.flatten(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- [1, 2, 3, 4]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[[1, 2], [3, 4]]]})
+ >>> result =
df.select(dfn.functions.flatten(dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ [1, 2, 3, 4]
"""
return Expr(f.flatten(array.expr))
@@ -3139,18 +3320,21 @@ def cardinality(array: Expr) -> Expr:
"""Returns the total number of elements in the array.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
- >>> result =
df.select(dfn.functions.cardinality(dfn.col("a")).alias("result"))
- >>> result.collect_column("result")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+ >>> result =
df.select(dfn.functions.cardinality(dfn.col("a")).alias("result"))
+ >>> result.collect_column("result")[0].as_py()
+ 3
"""
return Expr(f.cardinality(array.expr))
def empty(array: Expr) -> Expr:
- """This is an alias for :py:func:`array_empty`."""
+ """Returns true if the array is empty.
+
+ See Also:
+ This is an alias for :py:func:`array_empty`.
+ """
return array_empty(array)
@@ -3173,13 +3357,22 @@ def approx_distinct(
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1, 1, 2, 3]})
- >>> result = df.aggregate(
- ... [], [dfn.functions.approx_distinct(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py() == 3
- True
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1, 1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.approx_distinct(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py() == 3
+ True
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.approx_distinct(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(1)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py() == 2
+ True
"""
filter_raw = filter.expr if filter is not None else None
@@ -3200,13 +3393,22 @@ def approx_median(expression: Expr, filter: Expr | None
= None) -> Expr:
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
- >>> result = df.aggregate(
- ... [], [dfn.functions.approx_median(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 2.0
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.approx_median(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.approx_median(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.5
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.approx_median(expression.expr, filter=filter_raw))
@@ -3240,13 +3442,23 @@ def approx_percentile_cont(
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]})
- >>> result = df.aggregate(
- ... [], [dfn.functions.approx_percentile_cont(dfn.col("a"),
0.5).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 3.0
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.approx_percentile_cont(
+ ... dfn.col("a"), 0.5
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 3.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.approx_percentile_cont(
+ ... dfn.col("a"), 0.5,
+ ... num_centroids=10,
+ ... filter=dfn.col("a") > dfn.lit(1.0),
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 3.5
"""
sort_expr_raw = sort_or_default(sort_expression)
filter_raw = filter.expr if filter is not None else None
@@ -3280,14 +3492,23 @@ def approx_percentile_cont_with_weight(
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]})
- >>> result = df.aggregate(
- ... [], [dfn.functions.approx_percentile_cont_with_weight(dfn.col("a"),
- ... dfn.col("w"), 0.5).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 2.0
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.approx_percentile_cont_with_weight(
+ ... dfn.col("a"), dfn.col("w"), 0.5
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.approx_percentile_cont_with_weight(
+ ... dfn.col("a"), dfn.col("w"), 0.5,
+ ... num_centroids=10,
+ ... filter=dfn.col("a") > dfn.lit(1.0),
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.5
"""
sort_expr_raw = sort_or_default(sort_expression)
filter_raw = filter.expr if filter is not None else None
@@ -3324,12 +3545,31 @@ def array_agg(
order_by: Order the resultant array values. Accepts column names or
expressions.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1, 2, 3]})
- >>> result = df.aggregate([],
[dfn.functions.array_agg(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- [1, 2, 3]
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.array_agg(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ [1, 2, 3]
+
+ >>> df = ctx.from_pydict({"a": [3, 1, 2, 1]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.array_agg(
+ ... dfn.col("a"), distinct=True,
+ ... ).alias("v")])
+ >>> sorted(result.collect_column("v")[0].as_py())
+ [1, 2, 3]
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.array_agg(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(1),
+ ... order_by="a",
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ [2, 3]
"""
order_by_raw = sort_list_to_raw_sort_list(order_by)
filter_raw = filter.expr if filter is not None else None
@@ -3357,12 +3597,22 @@ def avg(
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
- >>> result = df.aggregate([], [dfn.functions.avg(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 2.0
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.avg(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.avg(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.5
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.avg(expression.expr, filter=filter_raw))
@@ -3385,7 +3635,17 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr |
None = None) -> Expr:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.corr(dfn.col("a"),
dfn.col("b")).alias("v")])
+ ... [], [dfn.functions.corr(
+ ... dfn.col("a"), dfn.col("b")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 1.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.corr(
+ ... dfn.col("a"), dfn.col("b"),
+ ... filter=dfn.col("a") > dfn.lit(1.0)
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
@@ -3411,12 +3671,23 @@ def count(
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1, 2, 3]})
- >>> result = df.aggregate([],
[dfn.functions.count(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.count(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 3
+
+ >>> df = ctx.from_pydict({"a": [1, 1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.count(
+ ... dfn.col("a"), distinct=True,
+ ... filter=dfn.col("a") > dfn.lit(1),
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2
"""
filter_raw = filter.expr if filter is not None else None
@@ -3454,6 +3725,18 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr
| None = None) -> Expr:
... )
>>> result.collect_column("v")[0].as_py()
3.0
+
+ >>> df = ctx.from_pydict(
+ ... {"a": [0.0, 1.0, 3.0], "b": [0.0, 1.0, 3.0]})
+ >>> result = df.aggregate(
+ ... [],
+ ... [dfn.functions.covar_pop(
+ ... dfn.col("a"), dfn.col("b"),
+ ... filter=dfn.col("a") > dfn.lit(0.0)
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw))
@@ -3476,9 +3759,19 @@ def covar_samp(value_y: Expr, value_x: Expr, filter:
Expr | None = None) -> Expr
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.covar_samp(dfn.col("a"),
dfn.col("b")).alias("v")])
+ ... [], [dfn.functions.covar_samp(
+ ... dfn.col("a"), dfn.col("b")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.covar_samp(
+ ... dfn.col("a"), dfn.col("b"),
+ ... filter=dfn.col("a") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 0.5
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw))
@@ -3504,12 +3797,22 @@ def max(expression: Expr, filter: Expr | None = None)
-> Expr:
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1, 2, 3]})
- >>> result = df.aggregate([], [dfn.functions.max(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.max(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 3
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.max(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") < dfn.lit(3)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.max(expression.expr, filter=filter_raw))
@@ -3518,15 +3821,8 @@ def max(expression: Expr, filter: Expr | None = None) ->
Expr:
def mean(expression: Expr, filter: Expr | None = None) -> Expr:
"""Returns the average (mean) value of the argument.
- This is an alias for :py:func:`avg`.
-
- Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
- >>> result = df.aggregate([],
[dfn.functions.mean(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 2.0
+ See Also:
+ This is an alias for :py:func:`avg`.
"""
return avg(expression, filter)
@@ -3548,12 +3844,23 @@ def median(
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
- >>> result = df.aggregate([],
[dfn.functions.median(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 2.0
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.median(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.0
+
+ >>> df = ctx.from_pydict({"a": [1.0, 1.0, 2.0, 3.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.median(
+ ... dfn.col("a"), distinct=True,
+ ... filter=dfn.col("a") < dfn.lit(3.0),
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 1.5
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.median(expression.expr, distinct=distinct,
filter=filter_raw))
@@ -3570,12 +3877,22 @@ def min(expression: Expr, filter: Expr | None = None)
-> Expr:
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1, 2, 3]})
- >>> result = df.aggregate([], [dfn.functions.min(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 1
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.min(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 1
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.min(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(1)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.min(expression.expr, filter=filter_raw))
@@ -3597,12 +3914,22 @@ def sum(
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1, 2, 3]})
- >>> result = df.aggregate([], [dfn.functions.sum(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 6
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.sum(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 6
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.sum(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(1)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 5
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.sum(expression.expr, filter=filter_raw))
@@ -3621,9 +3948,20 @@ def stddev(expression: Expr, filter: Expr | None = None)
-> Expr:
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
- >>> result = df.aggregate([],
[dfn.functions.stddev(dfn.col("a")).alias("v")])
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.stddev(
+ ... dfn.col("a")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.stddev(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(2.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 1.41...
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.stddev(expression.expr, filter=filter_raw))
@@ -3641,9 +3979,21 @@ def stddev_pop(expression: Expr, filter: Expr | None =
None) -> Expr:
Examples:
>>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 3.0]})
+ >>> df = ctx.from_pydict({"a": [0.0, 1.0, 3.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")]
+ ... [], [dfn.functions.stddev_pop(
+ ... dfn.col("a")
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 1.247...
+
+ >>> df = ctx.from_pydict({"a": [0.0, 1.0, 3.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.stddev_pop(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(0.0)
+ ... ).alias("v")]
... )
>>> result.collect_column("v")[0].as_py()
1.0
@@ -3655,16 +4005,8 @@ def stddev_pop(expression: Expr, filter: Expr | None =
None) -> Expr:
def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample standard deviation of the argument.
- This is an alias for :py:func:`stddev`.
-
- Examples:
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
- >>> result = df.aggregate(
- ... [], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")]
- ... )
- >>> result.collect_column("v")[0].as_py()
- 2.0
+ See Also:
+ This is an alias for :py:func:`stddev`.
"""
return stddev(arg, filter=filter)
@@ -3672,14 +4014,8 @@ def stddev_samp(arg: Expr, filter: Expr | None = None)
-> Expr:
def var(expression: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample variance of the argument.
- This is an alias for :py:func:`var_samp`.
-
- Examples:
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
- >>> result = df.aggregate([],
[dfn.functions.var(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 1.0
+ See Also:
+ This is an alias for :py:func:`var_samp`.
"""
return var_samp(expression, filter)
@@ -3696,8 +4032,19 @@ def var_pop(expression: Expr, filter: Expr | None =
None) -> Expr:
Examples:
>>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [0.0, 2.0]})
- >>> result = df.aggregate([],
[dfn.functions.var_pop(dfn.col("a")).alias("v")])
+ >>> df = ctx.from_pydict({"a": [-1.0, 0.0, 2.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.var_pop(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 1.555...
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.var_pop(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(-1.0)
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
@@ -3718,9 +4065,20 @@ def var_samp(expression: Expr, filter: Expr | None =
None) -> Expr:
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
- >>> result = df.aggregate([],
[dfn.functions.var_samp(dfn.col("a")).alias("v")])
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.var_samp(
+ ... dfn.col("a")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.var_samp(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 0.5
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.var_sample(expression.expr, filter=filter_raw))
@@ -3729,16 +4087,8 @@ def var_samp(expression: Expr, filter: Expr | None =
None) -> Expr:
def var_sample(expression: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample variance of the argument.
- This is an alias for :py:func:`var_samp`.
-
- Examples:
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
- >>> result = df.aggregate(
- ... [], [dfn.functions.var_sample(dfn.col("a")).alias("v")]
- ... )
- >>> result.collect_column("v")[0].as_py()
- 1.0
+ See Also:
+ This is an alias for :py:func:`var_samp`.
"""
return var_samp(expression, filter)
@@ -3765,9 +4115,19 @@ def regr_avgx(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.regr_avgx(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [], [dfn.functions.regr_avgx(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
5.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.regr_avgx(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 5.5
"""
filter_raw = filter.expr if filter is not None else None
@@ -3796,9 +4156,19 @@ def regr_avgy(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.regr_avgy(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [], [dfn.functions.regr_avgy(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.regr_avgy(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.5
"""
filter_raw = filter.expr if filter is not None else None
@@ -3827,9 +4197,19 @@ def regr_count(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.regr_count(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [], [dfn.functions.regr_count(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
3
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.regr_count(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2
"""
filter_raw = filter.expr if filter is not None else None
@@ -3856,12 +4236,23 @@ def regr_intercept(
Examples:
>>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
+ >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [4.0, 16.0,
36.0]})
>>> result = df.aggregate(
... [],
- ... [dfn.functions.regr_intercept(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [dfn.functions.regr_intercept(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
- 0.0
+ 1.714...
+
+ >>> result = df.aggregate(
+ ... [],
+ ... [dfn.functions.regr_intercept(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(2.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 2.4
"""
filter_raw = filter.expr if filter is not None else None
@@ -3888,9 +4279,19 @@ def regr_r2(
Examples:
>>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
+ >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [4.0, 16.0,
36.0]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.regr_r2(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 0.9795...
+
>>> result = df.aggregate(
- ... [], [dfn.functions.regr_r2(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [], [dfn.functions.regr_r2(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(2.0)
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
@@ -3919,11 +4320,21 @@ def regr_slope(
Examples:
>>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
+ >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [4.0, 16.0,
36.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.regr_slope(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [], [dfn.functions.regr_slope(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
- 2.0
+ 0.122...
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.regr_slope(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(2.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 0.1
"""
filter_raw = filter.expr if filter is not None else None
@@ -3952,9 +4363,19 @@ def regr_sxx(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.regr_sxx(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [], [dfn.functions.regr_sxx(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.regr_sxx(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 0.5
"""
filter_raw = filter.expr if filter is not None else None
@@ -3983,9 +4404,19 @@ def regr_sxy(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.regr_sxy(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [], [dfn.functions.regr_sxy(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.regr_sxy(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 0.5
"""
filter_raw = filter.expr if filter is not None else None
@@ -4014,9 +4445,19 @@ def regr_syy(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
- ... [], [dfn.functions.regr_syy(dfn.col("y"),
dfn.col("x")).alias("v")])
+ ... [], [dfn.functions.regr_syy(
+ ... dfn.col("y"), dfn.col("x")
+ ... ).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.regr_syy(
+ ... dfn.col("y"), dfn.col("x"),
+ ... filter=dfn.col("y") > dfn.lit(1.0)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 0.5
"""
filter_raw = filter.expr if filter is not None else None
@@ -4047,10 +4488,24 @@ def first_value(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.aggregate(
- ... [], [dfn.functions.first_value(dfn.col("a")).alias("v")]
+ ... [], [dfn.functions.first_value(
+ ... dfn.col("a")
+ ... ).alias("v")]
... )
>>> result.collect_column("v")[0].as_py()
10
+
+ >>> df = ctx.from_pydict({"a": [None, 20, 10]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.first_value(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(10),
+ ... order_by="a",
+ ... null_treatment=dfn.common.NullTreatment.IGNORE_NULLS,
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 20
"""
order_by_raw = sort_list_to_raw_sort_list(order_by)
filter_raw = filter.expr if filter is not None else None
@@ -4089,10 +4544,24 @@ def last_value(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.aggregate(
- ... [], [dfn.functions.last_value(dfn.col("a")).alias("v")]
+ ... [], [dfn.functions.last_value(
+ ... dfn.col("a")
+ ... ).alias("v")]
... )
>>> result.collect_column("v")[0].as_py()
30
+
+ >>> df = ctx.from_pydict({"a": [None, 20, 10]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.last_value(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(10),
+ ... order_by="a",
+ ... null_treatment=dfn.common.NullTreatment.IGNORE_NULLS,
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 20
"""
order_by_raw = sort_list_to_raw_sort_list(order_by)
filter_raw = filter.expr if filter is not None else None
@@ -4133,7 +4602,20 @@ def nth_value(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.aggregate(
- ... [], [dfn.functions.nth_value(dfn.col("a"), 2).alias("v")]
+ ... [], [dfn.functions.nth_value(
+ ... dfn.col("a"), 1
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 10
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.nth_value(
+ ... dfn.col("a"), 1,
+ ... filter=dfn.col("a") > dfn.lit(10),
+ ... order_by="a",
+ ... null_treatment=dfn.common.NullTreatment.IGNORE_NULLS,
+ ... ).alias("v")]
... )
>>> result.collect_column("v")[0].as_py()
20
@@ -4165,12 +4647,23 @@ def bit_and(expression: Expr, filter: Expr | None =
None) -> Expr:
filter: If provided, only compute against rows for which the filter is
True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [7, 3]})
- >>> result = df.aggregate([],
[dfn.functions.bit_and(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [7, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bit_and(
+ ... dfn.col("a")
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 3
+
+ >>> df = ctx.from_pydict({"a": [7, 5, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bit_and(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(3)
+ ... ).alias("v")])
+ >>> result.collect_column("v")[0].as_py()
+ 5
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bit_and(expression.expr, filter=filter_raw))
@@ -4189,12 +4682,25 @@ def bit_or(expression: Expr, filter: Expr | None = None) -> Expr:
filter: If provided, only compute against rows for which the filter is True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [1, 2]})
- >>> result = df.aggregate([], [dfn.functions.bit_or(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 3
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1, 2]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bit_or(
+ ... dfn.col("a")
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 3
+
+ >>> df = ctx.from_pydict({"a": [1, 2, 4]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bit_or(
+ ... dfn.col("a"),
+ ... filter=dfn.col("a") > dfn.lit(1)
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 6
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bit_or(expression.expr, filter=filter_raw))
@@ -4216,12 +4722,25 @@ def bit_xor(
filter: If provided, only compute against rows for which the filter is True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [5, 3]})
- >>> result = df.aggregate([], [dfn.functions.bit_xor(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- 6
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [5, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bit_xor(
+ ... dfn.col("a")
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 6
+
+ >>> df = ctx.from_pydict({"a": [5, 5, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bit_xor(
+ ... dfn.col("a"), distinct=True,
+ ... filter=dfn.col("a") > dfn.lit(3),
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ 5
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw))
@@ -4241,12 +4760,26 @@ def bool_and(expression: Expr, filter: Expr | None = None) -> Expr:
filter: If provided, only compute against rows for which the filter is True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [True, True, False]})
- >>> result = df.aggregate([], [dfn.functions.bool_and(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- False
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [True, True, False]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bool_and(
+ ... dfn.col("a")
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ False
+
+ >>> df = ctx.from_pydict(
+ ... {"a": [True, True, False], "b": [1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bool_and(
+ ... dfn.col("a"),
+ ... filter=dfn.col("b") < dfn.lit(3)
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ True
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bool_and(expression.expr, filter=filter_raw))
@@ -4266,12 +4799,26 @@ def bool_or(expression: Expr, filter: Expr | None = None) -> Expr:
filter: If provided, only compute against rows for which the filter is True
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": [False, False, True]})
- >>> result = df.aggregate([], [dfn.functions.bool_or(dfn.col("a")).alias("v")])
- >>> result.collect_column("v")[0].as_py()
- True
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [False, False, True]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bool_or(
+ ... dfn.col("a")
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ True
+
+ >>> df = ctx.from_pydict(
+ ... {"a": [False, False, True], "b": [1, 2, 3]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.bool_or(
+ ... dfn.col("a"),
+ ... filter=dfn.col("b") < dfn.lit(3)
+ ... ).alias("v")]
+ ... )
+ >>> result.collect_column("v")[0].as_py()
+ False
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bool_or(expression.expr, filter=filter_raw))
@@ -4318,10 +4865,23 @@ def lead(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.select(
- ... dfn.col("a"), dfn.functions.lead(dfn.col("a"), shift_offset=1,
- ... default_value=0, order_by="a").alias("lead"))
+ ... dfn.col("a"),
+ ... dfn.functions.lead(
+ ... dfn.col("a"), shift_offset=1,
+ ... default_value=0, order_by="a"
+ ... ).alias("lead"))
>>> result.sort(dfn.col("a")).collect_column("lead").to_pylist()
[2, 3, 0]
+
+ >>> df = ctx.from_pydict({"g": ["a", "a", "b"], "v": [1, 2, 3]})
+ >>> result = df.select(
+ ... dfn.col("g"), dfn.col("v"),
+ ... dfn.functions.lead(
+ ... dfn.col("v"), shift_offset=1, default_value=0,
+ ... partition_by=dfn.col("g"), order_by="v",
+ ... ).alias("lead"))
+ >>> result.sort(dfn.col("g"), dfn.col("v")).collect_column("lead").to_pylist()
+ [2, 0, 0]
"""
if not isinstance(default_value, pa.Scalar) and default_value is not None:
default_value = pa.scalar(default_value)
@@ -4378,10 +4938,23 @@ def lag(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.select(
- ... dfn.col("a"), dfn.functions.lag(dfn.col("a"), shift_offset=1,
- ... default_value=0, order_by="a").alias("lag"))
+ ... dfn.col("a"),
+ ... dfn.functions.lag(
+ ... dfn.col("a"), shift_offset=1,
+ ... default_value=0, order_by="a"
+ ... ).alias("lag"))
>>> result.sort(dfn.col("a")).collect_column("lag").to_pylist()
[0, 1, 2]
+
+ >>> df = ctx.from_pydict({"g": ["a", "a", "b"], "v": [1, 2, 3]})
+ >>> result = df.select(
+ ... dfn.col("g"), dfn.col("v"),
+ ... dfn.functions.lag(
+ ... dfn.col("v"), shift_offset=1, default_value=0,
+ ... partition_by=dfn.col("g"), order_by="v",
+ ... ).alias("lag"))
+ >>> result.sort(dfn.col("g"), dfn.col("v")).collect_column("lag").to_pylist()
+ [0, 1, 0]
"""
if not isinstance(default_value, pa.Scalar):
default_value = pa.scalar(default_value)
@@ -4428,9 +5001,22 @@ def row_number(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.select(
- ... dfn.col("a"), dfn.functions.row_number(order_by="a").alias("rn"))
+ ... dfn.col("a"),
+ ... dfn.functions.row_number(
+ ... order_by="a"
+ ... ).alias("rn"))
>>> result.sort(dfn.col("a")).collect_column("rn").to_pylist()
[1, 2, 3]
+
+ >>> df = ctx.from_pydict(
+ ... {"g": ["a", "a", "b", "b"], "v": [1, 2, 3, 4]})
+ >>> result = df.select(
+ ... dfn.col("g"), dfn.col("v"),
+ ... dfn.functions.row_number(
+ ... partition_by=dfn.col("g"), order_by="v",
+ ... ).alias("rn"))
+ >>> result.sort(dfn.col("g"), dfn.col("v")).collect_column("rn").to_pylist()
+ [1, 2, 1, 2]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
@@ -4476,10 +5062,23 @@ def rank(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 10, 20]})
>>> result = df.select(
- ... dfn.col("a"), dfn.functions.rank(order_by="a").alias("rnk")
+ ... dfn.col("a"),
+ ... dfn.functions.rank(
+ ... order_by="a"
+ ... ).alias("rnk")
... )
>>> result.sort(dfn.col("a")).collect_column("rnk").to_pylist()
[1, 1, 3]
+
+ >>> df = ctx.from_pydict(
+ ... {"g": ["a", "a", "b", "b"], "v": [1, 1, 2, 3]})
+ >>> result = df.select(
+ ... dfn.col("g"), dfn.col("v"),
+ ... dfn.functions.rank(
+ ... partition_by=dfn.col("g"), order_by="v",
+ ... ).alias("rnk"))
+ >>> result.sort(dfn.col("g"), dfn.col("v")).collect_column("rnk").to_pylist()
+ [1, 1, 1, 2]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
@@ -4520,9 +5119,22 @@ def dense_rank(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 10, 20]})
>>> result = df.select(
- ... dfn.col("a"), dfn.functions.dense_rank(order_by="a").alias("dr"))
+ ... dfn.col("a"),
+ ... dfn.functions.dense_rank(
+ ... order_by="a"
+ ... ).alias("dr"))
>>> result.sort(dfn.col("a")).collect_column("dr").to_pylist()
[1, 1, 2]
+
+ >>> df = ctx.from_pydict(
+ ... {"g": ["a", "a", "b", "b"], "v": [1, 1, 2, 3]})
+ >>> result = df.select(
+ ... dfn.col("g"), dfn.col("v"),
+ ... dfn.functions.dense_rank(
+ ... partition_by=dfn.col("g"), order_by="v",
+ ... ).alias("dr"))
+ >>> result.sort(dfn.col("g"), dfn.col("v")).collect_column("dr").to_pylist()
+ [1, 1, 1, 2]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
@@ -4565,9 +5177,22 @@ def percent_rank(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.select(
- ... dfn.col("a"), dfn.functions.percent_rank(order_by="a").alias("pr"))
+ ... dfn.col("a"),
+ ... dfn.functions.percent_rank(
+ ... order_by="a"
+ ... ).alias("pr"))
>>> result.sort(dfn.col("a")).collect_column("pr").to_pylist()
[0.0, 0.5, 1.0]
+
+ >>> df = ctx.from_pydict(
+ ... {"g": ["a", "a", "a", "b", "b"], "v": [1, 2, 3, 4, 5]})
+ >>> result = df.select(
+ ... dfn.col("g"), dfn.col("v"),
+ ... dfn.functions.percent_rank(
+ ... partition_by=dfn.col("g"), order_by="v",
+ ... ).alias("pr"))
+ >>> result.sort(dfn.col("g"), dfn.col("v")).collect_column("pr").to_pylist()
+ [0.0, 0.5, 1.0, 0.0, 1.0]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
@@ -4616,6 +5241,16 @@ def cume_dist(
... )
>>> result.collect_column("cd").to_pylist()
[0.25..., 0.75..., 0.75..., 1.0...]
+
+ >>> df = ctx.from_pydict(
+ ... {"g": ["a", "a", "b", "b"], "v": [1, 2, 3, 4]})
+ >>> result = df.select(
+ ... dfn.col("g"), dfn.col("v"),
+ ... dfn.functions.cume_dist(
+ ... partition_by=dfn.col("g"), order_by="v",
+ ... ).alias("cd"))
+ >>> result.sort(dfn.col("g"), dfn.col("v")).collect_column("cd").to_pylist()
+ [0.5, 1.0, 0.5, 1.0]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
@@ -4661,9 +5296,22 @@ def ntile(
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30, 40]})
>>> result = df.select(
- ... dfn.col("a"), dfn.functions.ntile(2, order_by="a").alias("nt"))
+ ... dfn.col("a"),
+ ... dfn.functions.ntile(
+ ... 2, order_by="a"
+ ... ).alias("nt"))
>>> result.sort(dfn.col("a")).collect_column("nt").to_pylist()
[1, 1, 2, 2]
+
+ >>> df = ctx.from_pydict(
+ ... {"g": ["a", "a", "b", "b"], "v": [1, 2, 3, 4]})
+ >>> result = df.select(
+ ... dfn.col("g"), dfn.col("v"),
+ ... dfn.functions.ntile(
+ ... 2, partition_by=dfn.col("g"), order_by="v",
+ ... ).alias("nt"))
+ >>> result.sort(dfn.col("g"), dfn.col("v")).collect_column("nt").to_pylist()
+ [1, 2, 1, 2]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
@@ -4700,13 +5348,23 @@ def string_agg(
column names or expressions.
Examples:
- ---------
- >>> ctx = dfn.SessionContext()
- >>> df = ctx.from_pydict({"a": ["x", "y", "z"]})
- >>> result = df.aggregate(
- ... [], [dfn.functions.string_agg(dfn.col("a"), ",", order_by="a").alias("s")])
- >>> result.collect_column("s")[0].as_py()
- 'x,y,z'
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": ["x", "y", "z"]})
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.string_agg(
+ ... dfn.col("a"), ",", order_by="a"
+ ... ).alias("s")])
+ >>> result.collect_column("s")[0].as_py()
+ 'x,y,z'
+
+ >>> result = df.aggregate(
+ ... [], [dfn.functions.string_agg(
+ ... dfn.col("a"), ",",
+ ... filter=dfn.col("a") > dfn.lit("x"),
+ ... order_by="a",
+ ... ).alias("s")])
+ >>> result.collect_column("s")[0].as_py()
+ 'y,z'
"""
order_by_raw = sort_list_to_raw_sort_list(order_by)
filter_raw = filter.expr if filter is not None else None
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]