Spaces:
Running
Running
Merge pull request #119 from thliang01/Working-with-Apache-Arrow
Browse files
DuckDB: Improve Apache Arrow + DuckDB notebook clarity and technical accuracy
duckdb/011_working_with_apache_arrow.py
CHANGED
@@ -14,7 +14,7 @@
|
|
14 |
|
15 |
import marimo
|
16 |
|
17 |
-
__generated_with = "0.14.
|
18 |
app = marimo.App(width="medium")
|
19 |
|
20 |
|
@@ -300,17 +300,15 @@ def _(mo):
|
|
300 |
### Key Benefits:
|
301 |
|
302 |
- **Memory Efficiency**: Arrow's columnar format uses 20-40% less memory than traditional DataFrames through compact columnar representation and better compression ratios
|
303 |
-
- **Zero-Copy Operations**: Data can be shared between DuckDB and Arrow-compatible systems (Polars, Pandas) without any data copying, eliminating redundant memory usage
|
304 |
- **Query Performance**: 2-10x faster queries compared to traditional approaches that require data copying
|
305 |
-
- **Larger-than-Memory Analysis**:
|
306 |
- **Advanced Query Optimization**: DuckDB's optimizer can push down filters and projections directly into Arrow scans, reading only relevant columns and partitions
|
307 |
Let's demonstrate these benefits with concrete examples:
|
308 |
"""
|
309 |
)
|
310 |
return
|
311 |
|
312 |
-
|
313 |
-
|
314 |
@app.cell(hide_code=True)
|
315 |
def _(mo):
|
316 |
mo.md(r"""### Memory Efficiency Demonstration""")
|
@@ -529,7 +527,6 @@ def _(mo):
|
|
529 |
|
530 |
@app.cell
|
531 |
def _(polars_data, time):
|
532 |
-
import psutil
|
533 |
import os
|
534 |
import pyarrow.compute as pc # Add this import
|
535 |
|
@@ -554,14 +551,14 @@ def _(polars_data, time):
|
|
554 |
# Compare with traditional copy-based operations
|
555 |
latest_start_time = time.time()
|
556 |
|
557 |
-
# These operations create copies
|
558 |
pandas_copy = polars_data.to_pandas()
|
559 |
pandas_sliced = pandas_copy.iloc[:100000].copy()
|
560 |
pandas_filtered = pandas_copy[pandas_copy['value'] > 500000].copy()
|
561 |
|
562 |
copy_ops_time = time.time() - latest_start_time
|
563 |
memory_after_copy = process.memory_info().rss / 1024 / 1024 # MB
|
564 |
-
|
565 |
print("Memory Usage Comparison:")
|
566 |
print(f"Initial memory: {memory_before:.2f} MB")
|
567 |
print(f"After Arrow operations: {memory_after_arrow:.2f} MB (diff: +{memory_after_arrow - memory_before:.2f} MB)")
|
@@ -606,6 +603,7 @@ def _():
|
|
606 |
import pandas as pd
|
607 |
import duckdb
|
608 |
import sqlglot
|
|
|
609 |
return duckdb, mo, pa, pd, pl
|
610 |
|
611 |
|
|
|
14 |
|
15 |
import marimo
|
16 |
|
17 |
+
__generated_with = "0.14.12"
|
18 |
app = marimo.App(width="medium")
|
19 |
|
20 |
|
|
|
300 |
### Key Benefits:
|
301 |
|
302 |
- **Memory Efficiency**: Arrow's columnar format uses 20-40% less memory than traditional DataFrames through compact columnar representation and better compression ratios
|
303 |
+
- **Zero-Copy Operations**: Data can be shared between DuckDB and Arrow-compatible systems (Polars, Pandas) without any data copying, eliminating redundant memory usage
|
304 |
- **Query Performance**: 2-10x faster queries compared to traditional approaches that require data copying
|
305 |
+
- **Larger-than-Memory Analysis**: Both DuckDB and Arrow-compatible libraries support streaming query results, allowing you to execute queries on data larger than available memory by processing data in batches.
|
306 |
- **Advanced Query Optimization**: DuckDB's optimizer can push down filters and projections directly into Arrow scans, reading only relevant columns and partitions
|
307 |
Let's demonstrate these benefits with concrete examples:
|
308 |
"""
|
309 |
)
|
310 |
return
|
311 |
|
|
|
|
|
312 |
@app.cell(hide_code=True)
|
313 |
def _(mo):
|
314 |
mo.md(r"""### Memory Efficiency Demonstration""")
|
|
|
527 |
|
528 |
@app.cell
|
529 |
def _(polars_data, time):
|
|
|
530 |
import os
|
531 |
import pyarrow.compute as pc # Add this import
|
532 |
|
|
|
551 |
# Compare with traditional copy-based operations
|
552 |
latest_start_time = time.time()
|
553 |
|
554 |
+
# These operations may create copies depending on Pandas' Copy-on-Write (CoW) behavior
|
555 |
pandas_copy = polars_data.to_pandas()
|
556 |
pandas_sliced = pandas_copy.iloc[:100000].copy()
|
557 |
pandas_filtered = pandas_copy[pandas_copy['value'] > 500000].copy()
|
558 |
|
559 |
copy_ops_time = time.time() - latest_start_time
|
560 |
memory_after_copy = process.memory_info().rss / 1024 / 1024 # MB
|
561 |
+
|
562 |
print("Memory Usage Comparison:")
|
563 |
print(f"Initial memory: {memory_before:.2f} MB")
|
564 |
print(f"After Arrow operations: {memory_after_arrow:.2f} MB (diff: +{memory_after_arrow - memory_before:.2f} MB)")
|
|
|
603 |
import pandas as pd
|
604 |
import duckdb
|
605 |
import sqlglot
|
606 |
+
import psutil
|
607 |
return duckdb, mo, pa, pd, pl
|
608 |
|
609 |
|