Srihari Thyagarajan commited on
Commit
a50306f
·
unverified ·
2 Parent(s): 88e688e ce78bc9

Merge pull request #103 from henryharbeck/window

Browse files
polars/12_aggregations.py CHANGED
@@ -8,7 +8,7 @@
8
 
9
  import marimo
10
 
11
- __generated_with = "0.11.14"
12
  app = marimo.App(width="medium")
13
 
14
 
@@ -208,7 +208,7 @@ def _(mo):
208
  r"""
209
  We had more sales in 2014.
210
 
211
- Now let's perform the above operation by groupin with time. This requires sorting the dataframe first.
212
  """
213
  )
214
  return
 
8
 
9
  import marimo
10
 
11
+ __generated_with = "0.12.9"
12
  app = marimo.App(width="medium")
13
 
14
 
 
208
  r"""
209
  We had more sales in 2014.
210
 
211
+ Now let's perform the above operation by grouping with time. This requires sorting the dataframe first.
212
  """
213
  )
214
  return
polars/13_window_functions.py ADDED
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.13"
3
+ # dependencies = [
4
+ # "duckdb==1.2.2",
5
+ # "marimo",
6
+ # "polars==1.29.0",
7
+ # "pyarrow==20.0.0",
8
+ # "sqlglot==26.16.4",
9
+ # ]
10
+ # ///
11
+
12
+ import marimo
13
+
14
+ __generated_with = "0.13.11"
15
+ app = marimo.App(width="medium", app_title="Window Functions")
16
+
17
+
18
+ @app.cell(hide_code=True)
19
+ def _(mo):
20
+ mo.md(
21
+ r"""
22
+ # Window Functions
23
+ _By [Henry Harbeck](https://github.com/henryharbeck)._
24
+
25
+ In this notebook, you'll learn how to perform different types of window functions in Polars.
26
+ You'll work with partitions, ordering and Polars' available "mapping strategies".
27
+
28
+ We'll use a dataset with a few days of paid and organic digital revenue data.
29
+ """
30
+ )
31
+ return
32
+
33
+
34
+ @app.cell
35
+ def _():
36
+ from datetime import date
37
+
38
+ import polars as pl
39
+
40
+ dates = pl.date_range(date(2025, 2, 1), date(2025, 2, 5), eager=True)
41
+
42
+ df = pl.DataFrame(
43
+ {
44
+ "date": pl.concat([dates, dates]).sort(),
45
+ "channel": ["Paid", "Organic"] * 5,
46
+ "revenue": [6000, 2000, 5200, 4500, 4200, 5900, 3500, 5000, 4800, 4800],
47
+ }
48
+ )
49
+
50
+ df
51
+ return date, df, pl
52
+
53
+
54
+ @app.cell(hide_code=True)
55
+ def _(mo):
56
+ mo.md(
57
+ r"""
58
+ ## What is a window function?
59
+
60
+ A window function performs a calculation across a set of rows that are related to the current row.
61
+ They allow you to perform aggregations and other calculations within a group without collapsing
62
+ the number of rows (opposed to a group by aggregation, which does collapse the number of rows). Typically the result of a
63
+ window function is assigned back to rows within the group, but Polars also offers additional alternatives.
64
+
65
+ Window functions can be used by specifying the [`over`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.over.html)
66
+ method on an expression.
67
+ """
68
+ )
69
+ return
70
+
71
+
72
+ @app.cell(hide_code=True)
73
+ def _(mo):
74
+ mo.md(
75
+ r"""
76
+ ## Partitions
77
+ Partitions are the "group by" columns. We will have one "window" of data per unique value in the partition column(s), to
78
+ which the function will be applied.
79
+ """
80
+ )
81
+ return
82
+
83
+
84
+ @app.cell(hide_code=True)
85
+ def _(mo):
86
+ mo.md(
87
+ r"""
88
+ ### Partitioning by a single column
89
+
90
+ Let's get the total revenue per date...
91
+ """
92
+ )
93
+ return
94
+
95
+
96
+ @app.cell
97
+ def _(df, pl):
98
+ daily_revenue = pl.col("revenue").sum().over("date")
99
+
100
+ df.with_columns(daily_revenue.alias("daily_revenue"))
101
+ return (daily_revenue,)
102
+
103
+
104
+ @app.cell(hide_code=True)
105
+ def _(mo):
106
+ mo.md(r"""And then see what percentage of the daily total was Paid and what percentage was Organic.""")
107
+ return
108
+
109
+
110
+ @app.cell
111
+ def _(daily_revenue, df, pl):
112
+ df.with_columns(daily_revenue_pct=(pl.col("revenue") / daily_revenue))
113
+ return
114
+
115
+
116
+ @app.cell(hide_code=True)
117
+ def _(mo):
118
+ mo.md(
119
+ r"""
120
+ Let's now calculate the maximum revenue, cumulative revenue, rank the revenue and calculate the day-on-day change,
121
+ all partitioned (split) by channel.
122
+ """
123
+ )
124
+ return
125
+
126
+
127
+ @app.cell
128
+ def _(df, pl):
129
+ df.with_columns(
130
+ maximum_revenue=pl.col("revenue").max().over("channel"),
131
+ cumulative_revenue=pl.col("revenue").cum_sum().over("channel"),
132
+ revenue_rank=pl.col("revenue").rank(descending=True).over("channel"),
133
+ day_on_day_change=pl.col("revenue").diff().over("channel"),
134
+ )
135
+ return
136
+
137
+
138
+ @app.cell(hide_code=True)
139
+ def _(mo):
140
+ mo.md(
141
+ r"""
142
+ Note that aggregation functions such as `sum` and `max` have their value applied back to each row in the partition
143
+ (group). Non-aggregate functions such as `cum_sum`, `rank` and `diff` can produce different values per row, but
144
+ still only consider rows within their partition.
145
+ """
146
+ )
147
+ return
148
+
149
+
150
+ @app.cell(hide_code=True)
151
+ def _(mo):
152
+ mo.md(
153
+ r"""
154
+ ### Partitioning by multiple columns
155
+
156
+ We can also partition by multiple columns.
157
+
158
+ Let's add a column to see whether it is a weekday (business day), then get the maximum revenue by that and
159
+ the channel.
160
+ """
161
+ )
162
+ return
163
+
164
+
165
+ @app.cell
166
+ def _(df, pl):
167
+ (
168
+ df.with_columns(
169
+ is_weekday=pl.col("date").dt.is_business_day(),
170
+ ).with_columns(
171
+ max_rev_by_channel_and_weekday=pl.col("revenue").max().over("is_weekday", "channel"),
172
+ )
173
+ )
174
+ return
175
+
176
+
177
+ @app.cell(hide_code=True)
178
+ def _(mo):
179
+ mo.md(
180
+ r"""
181
+ ### Partitioning by expressions
182
+
183
+ Polars also lets you partition by expressions without needing to create them as columns first.
184
+
185
+ So, we could re-write the previous window function as...
186
+ """
187
+ )
188
+ return
189
+
190
+
191
+ @app.cell
192
+ def _(df, pl):
193
+ df.with_columns(
194
+ max_rev_by_channel_and_weekday=pl.col("revenue")
195
+ .max()
196
+ .over((pl.col("date").dt.is_business_day()), "channel")
197
+ )
198
+ return
199
+
200
+
201
+ @app.cell(hide_code=True)
202
+ def _(mo):
203
+ mo.md(
204
+ r"""
205
+ Window functions fit into Polars' composable [expressions API](https://docs.pola.rs/user-guide/concepts/expressions-and-contexts/#expressions),
206
+ so can be combined with all [aggregation methods](https://docs.pola.rs/api/python/stable/reference/expressions/aggregation.html)
207
+ and methods that consider more than 1 row (e.g., `cum_sum`, `rank` and `diff` as we just saw).
208
+ """
209
+ )
210
+ return
211
+
212
+
213
+ @app.cell(hide_code=True)
214
+ def _(mo):
215
+ mo.md(
216
+ r"""
217
+ ## Ordering
218
+
219
+ The `order_by` parameter controls how to order the data within the window. The function is applied to the data in this
220
+ order.
221
+
222
+ Up until this point, we have been letting Polars do the window function calculations based on the order of the rows in the
223
+ DataFrame. There can be times where we would like order of the calculation and the order of the output itself to differ.
224
+ """
225
+ )
226
+ return
227
+
228
+
229
+ @app.cell(hide_code=True)
230
+ def _(mo):
231
+ mo.md(
232
+ """
233
+ ### Ordering in a window function
234
+
235
+ Let's say we want the DataFrame ordered by day of week, but we still want cumulative revenue and the first revenue observation, both
236
+ ordered by date and partitioned by channel...
237
+ """
238
+ )
239
+ return
240
+
241
+
242
+ @app.cell
243
+ def _(df, pl):
244
+ df_sorted = (
245
+ # Monday = 1, Sunday = 7
246
+ df.sort(pl.col("date").dt.weekday())
247
+ # Show the weekday for transparency
248
+ .with_columns(pl.col("date").dt.to_string("%a").alias("weekday"))
249
+ )
250
+
251
+ df_sorted.select(
252
+ "date",
253
+ "weekday",
254
+ "channel",
255
+ "revenue",
256
+ pl.col("revenue").cum_sum().over("channel", order_by="date").alias("cumulative_revenue"),
257
+ pl.col("revenue").first().over("channel", order_by="date").alias("first_revenue"),
258
+ )
259
+ return (df_sorted,)
260
+
261
+
262
+ @app.cell(hide_code=True)
263
+ def _(mo):
264
+ mo.md(
265
+ r"""
266
+ ### Note about window function ordering compared to SQL
267
+
268
+ It is worth noting that traditionally in SQL, many more functions require an `ORDER BY` within `OVER` than in
269
+ equivalent functions in Polars.
270
+
271
+ For example, an SQL `RANK()` expression like...
272
+ """
273
+ )
274
+ return
275
+
276
+
277
+ @app.cell
278
+ def _(df, mo):
279
+ _df = mo.sql(
280
+ f"""
281
+ SELECT
282
+ date,
283
+ channel,
284
+ revenue,
285
+ RANK() OVER (PARTITION BY channel ORDER BY revenue DESC) AS revenue_rank
286
+ FROM df
287
+ -- re-sort the output back to the original order for ease of comparison
288
+ ORDER BY date, channel DESC
289
+ """
290
+ )
291
+ return
292
+
293
+
294
+ @app.cell(hide_code=True)
295
+ def _(mo):
296
+ mo.md(
297
+ r"""
298
+ ...does not require an `order_by` in Polars as the column and the function are already bound (including with the
299
+ `descending=True` argument).
300
+ """
301
+ )
302
+ return
303
+
304
+
305
+ @app.cell
306
+ def _(df, pl):
307
+ df.select(
308
+ "date",
309
+ "channel",
310
+ "revenue",
311
+ revenue_rank=pl.col("revenue").rank(descending=True).over("channel"),
312
+ )
313
+ return
314
+
315
+
316
+ @app.cell(hide_code=True)
317
+ def _(mo):
318
+ mo.md(
319
+ r"""
320
+ ### Descending order
321
+
322
+ We can also order in descending order by passing `descending=True`...
323
+ """
324
+ )
325
+ return
326
+
327
+
328
+ @app.cell
329
+ def _(df_sorted, pl):
330
+ (
331
+ df_sorted.select(
332
+ "date",
333
+ "weekday",
334
+ "channel",
335
+ "revenue",
336
+ pl.col("revenue").cum_sum().over("channel", order_by="date").alias("cumulative_revenue"),
337
+ pl.col("revenue").first().over("channel", order_by="date").alias("first_revenue"),
338
+ pl.col("revenue")
339
+ .first()
340
+ .over("channel", order_by="date", descending=True)
341
+ .alias("last_revenue"),
342
+ # Or, alternatively
343
+ pl.col("revenue").last().over("channel", order_by="date").alias("also_last_revenue"),
344
+ )
345
+ )
346
+ return
347
+
348
+
349
+ @app.cell(hide_code=True)
350
+ def _(mo):
351
+ mo.md(
352
+ """
353
+ ## Mapping Strategies
354
+
355
+ Mapping Strategies control how Polars maps the result of the window function back to the original DataFrame
356
+
357
+ Generally (by default) the result of a window function is assigned back to rows within the group. Through Polars' mapping
358
+ strategies, we will explore other possibilities.
359
+ """
360
+ )
361
+ return
362
+
363
+
364
+ @app.cell(hide_code=True)
365
+ def _(mo):
366
+ mo.md(
367
+ """
368
+ ### Group to rows
369
+
370
+ "group_to_rows" is the default mapping strategy and assigns the result of the window function back to the rows in the
371
+ window.
372
+ """
373
+ )
374
+ return
375
+
376
+
377
+ @app.cell
378
+ def _(df, pl):
379
+ df.with_columns(
380
+ cumulative_revenue=pl.col("revenue").cum_sum().over("channel", mapping_strategy="group_to_rows")
381
+ )
382
+ return
383
+
384
+
385
+ @app.cell(hide_code=True)
386
+ def _(mo):
387
+ mo.md(
388
+ """
389
+ ### Join
390
+
391
+ The "join" mapping strategy aggregates the resulting values in a list and repeats the list for all rows in the group.
392
+ """
393
+ )
394
+ return
395
+
396
+
397
+ @app.cell
398
+ def _(df, pl):
399
+ df.with_columns(
400
+ cumulative_revenue=pl.col("revenue").cum_sum().over("channel", mapping_strategy="join")
401
+ )
402
+ return
403
+
404
+
405
+ @app.cell(hide_code=True)
406
+ def _(mo):
407
+ mo.md(
408
+ r"""
409
+ ### Explode
410
+
411
+ The "explode" mapping strategy is similar to "group_to_rows", but is typically faster and does not preserve the order of
412
+ rows. Due to this, it requires sorting columns (including those not in the window function) for the result to make sense.
413
+ It should also only be used in a `select` context and not `with_columns`.
414
+
415
+ The result of "explode" is similar to a `group_by` followed by an `agg` followed by an `explode`.
416
+ """
417
+ )
418
+ return
419
+
420
+
421
+ @app.cell
422
+ def _(df, pl):
423
+ df.select(
424
+ pl.all().over("channel", order_by="date", mapping_strategy="explode"),
425
+ cumulative_revenue=pl.col("revenue")
426
+ .cum_sum()
427
+ .over("channel", order_by="date", mapping_strategy="explode"),
428
+ )
429
+ return
430
+
431
+
432
+ @app.cell(hide_code=True)
433
+ def _(mo):
434
+ mo.md(r"""Note the modified order of the rows in the output, (but data is the same)...""")
435
+ return
436
+
437
+
438
+ @app.cell(hide_code=True)
439
+ def _(mo):
440
+ mo.md(r"""## Other tips and tricks""")
441
+ return
442
+
443
+
444
+ @app.cell(hide_code=True)
445
+ def _(mo):
446
+ mo.md(
447
+ r"""
448
+ ### Reusing a window
449
+
450
+ In SQL there is a `WINDOW` keyword, which easily allows the re-use of the same window specification across expressions
451
+ without needing to repeat it. In Polars, this can be achieved by using `dict` unpacking to pass arguments to `over`.
452
+ """
453
+ )
454
+ return
455
+
456
+
457
+ @app.cell
458
+ def _(df_sorted, pl):
459
+ window = {
460
+ "partition_by": "date",
461
+ "order_by": "date",
462
+ "mapping_strategy": "group_to_rows",
463
+ }
464
+
465
+ df_sorted.with_columns(
466
+ pct_daily_revenue=(pl.col("revenue") / pl.col("revenue").sum()).over(**window),
467
+ highest_revenue_channel=pl.col("channel").top_k_by("revenue", k=1).first().over(**window),
468
+ daily_revenue_rank=pl.col("revenue").rank().over(**window),
469
+ )
470
+ return
471
+
472
+
473
+ @app.cell(hide_code=True)
474
+ def _(mo):
475
+ mo.md(
476
+ r"""
477
+ ### Rolling Windows
478
+
479
+ Much like in SQL, Polars also gives you the ability to do rolling window computations. In Polars, the rolling calculation
480
+ is also aware of temporal data, making it easy to express if the data is not contiguous (i.e., observations are missing).
481
+
482
+ Let's look at an example of that now by filtering out one day of our data and then calculating both a 3-day and 3-row
483
+ max revenue split by channel...
484
+ """
485
+ )
486
+ return
487
+
488
+
489
+ @app.cell
490
+ def _(date, df, pl):
491
+ (
492
+ df.filter(pl.col("date") != date(2025, 2, 2))
493
+ .with_columns(
494
+ # "3d" -> 3 days
495
+ rev_3_day_max=pl.col("revenue").rolling_max_by("date", "3d", min_samples=1).over("channel"),
496
+ rev_3_row_max=pl.col("revenue").rolling_max(3, min_samples=1).over("channel"),
497
+ )
498
+ # sort to make the output a little easier to analyze
499
+ .sort("channel", "date")
500
+ )
501
+ return
502
+
503
+
504
+ @app.cell(hide_code=True)
505
+ def _(mo):
506
+ mo.md(r"""Notice the difference in the 2nd last row...""")
507
+ return
508
+
509
+
510
+ @app.cell(hide_code=True)
511
+ def _(mo):
512
+ mo.md(r"""We hope you enjoyed this notebook, demonstrating window functions in Polars!""")
513
+ return
514
+
515
+
516
+ @app.cell(hide_code=True)
517
+ def _(mo):
518
+ mo.md(
519
+ r"""
520
+ ## Additional References
521
+
522
+ - [Polars User guide - Window functions](https://docs.pola.rs/user-guide/expressions/window-functions/)
523
+ - [Polars over method API reference](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.over.html)
524
+ - [PostgreSQL window function documentation](https://www.postgresql.org/docs/current/tutorial-window.html)
525
+ """
526
+ )
527
+ return
528
+
529
+
530
+ @app.cell(hide_code=True)
531
+ def _():
532
+ import marimo as mo
533
+ return (mo,)
534
+
535
+
536
+ if __name__ == "__main__":
537
+ app.run()