Srihari Thyagarajan committed on
Commit 911073a · unverified · 2 parents: a100272, 2604086

Merge pull request #119 from thliang01/Working-with-Apache-Arrow


DuckDB: Improve Apache Arrow + DuckDB notebook clarity and technical accuracy

duckdb/011_working_with_apache_arrow.py CHANGED
@@ -14,7 +14,7 @@
 
 import marimo
 
-__generated_with = "0.14.11"
+__generated_with = "0.14.12"
 app = marimo.App(width="medium")
 
 
@@ -300,17 +300,15 @@ def _(mo):
     ### Key Benefits:
 
     - **Memory Efficiency**: Arrow's columnar format uses 20-40% less memory than traditional DataFrames through compact columnar representation and better compression ratios
-    - **Zero-Copy Operations**: Data can be shared between DuckDB and Arrow-compatible systems (Polars, Pandas) without any data copying, eliminating redundant memory usage
+    - **Zero-Copy Operations**: Data can be shared between DuckDB and Arrow-compatible systems (Polars, Pandas) without any data copying, eliminating redundant memory usage
     - **Query Performance**: 2-10x faster queries compared to traditional approaches that require data copying
-    - **Larger-than-Memory Analysis**: Since both libraries support streaming query results, you can execute queries on data bigger than available memory by processing one batch at a time
+    - **Larger-than-Memory Analysis**: Both DuckDB and Arrow-compatible libraries support streaming query results, allowing you to execute queries on data larger than available memory by processing data in batches.
     - **Advanced Query Optimization**: DuckDB's optimizer can push down filters and projections directly into Arrow scans, reading only relevant columns and partitions
     Let's demonstrate these benefits with concrete examples:
     """
     )
     return
 
-
-
 @app.cell(hide_code=True)
 def _(mo):
     mo.md(r"""### Memory Efficiency Demonstration""")
@@ -529,7 +527,6 @@ def _(mo):
 
 @app.cell
 def _(polars_data, time):
-    import psutil
     import os
     import pyarrow.compute as pc  # Add this import
 
@@ -554,14 +551,14 @@ def _(polars_data, time):
     # Compare with traditional copy-based operations
     latest_start_time = time.time()
 
-    # These operations create copies
+    # These operations may create copies depending on Pandas' Copy-on-Write (CoW) behavior
     pandas_copy = polars_data.to_pandas()
     pandas_sliced = pandas_copy.iloc[:100000].copy()
     pandas_filtered = pandas_copy[pandas_copy['value'] > 500000].copy()
 
     copy_ops_time = time.time() - latest_start_time
     memory_after_copy = process.memory_info().rss / 1024 / 1024  # MB
-
+
     print("Memory Usage Comparison:")
     print(f"Initial memory: {memory_before:.2f} MB")
     print(f"After Arrow operations: {memory_after_arrow:.2f} MB (diff: +{memory_after_arrow - memory_before:.2f} MB)")
@@ -606,6 +603,7 @@ def _():
     import pandas as pd
     import duckdb
     import sqlglot
+    import psutil
     return duckdb, mo, pa, pd, pl
 
 