# Polars & Pandas

|                   | Polars (2020)                                                                              | Pandas (2008)                                        |
| ----------------- | ------------------------------------------------------------------------------------------ | ---------------------------------------------------- |
| in-memory format  | Apache Arrow                                                                               | NumPy arrays                                         |
| evaluation        | eager and lazy (lazy mode applies query optimization)                                      | eager only                                           |
| index/multi-index | no index: rows are identified by integer position; always a flat 2D table, no multi-index | labels each row with an index; supports multi-index  |
| parallelism       | built in (reading data, transformations, aggregations, ...)                               | needs other libraries or techniques: Dask, ...       |
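A minimal sketch of the indexing difference, using toy data: pandas carries a row index around (and can attach arbitrary labels), while Polars rows are only ever addressed by position.
```python
import pandas as pd
import polars as pl

data = {"a": [1, 2, 3]}

pdf = pd.DataFrame(data, index=["x", "y", "z"])  # rows carry labels
print(pdf.loc["y"])        # look up a row by its label

pldf = pl.DataFrame(data)  # no index object exists
print(pldf.row(1))         # rows are addressed by integer position only
```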
## basic
### select
```python
# pandas
df['a']
df.loc[:, 'a']
# polars
df.select('a')  # or: df.select(pl.col('a'))
```
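Selecting several columns or derived expressions works the same way; a small sketch with a toy frame and hypothetical column names:
```python
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

# select returns a new frame containing only the requested columns/expressions
out = df.select(
    pl.col("a"),
    (pl.col("b") * 2).alias("b_doubled"),
)
```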
### filter
```python
# pandas
df[df['a'] > 1]
df.loc[df.a >1]
# polars
df.filter(pl.col('a') > 1)
# query optimization: in lazy mode, if you write multiple filters
# separately, Polars combines them into a single filter
# (see the sketch after this block)
# pandas
df.query("m2_living > 2500 and price < 300000")
df[(df["m2_living"] > 2500) & (df["price"] < 300000)]
# polars
df.filter(
(pl.col("m2_living") > 2500) & (pl.col("price") < 300000)
)
```
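A sketch of that optimization in action: build the same query lazily with two separate `filter` calls, then print the optimized plan with `explain()`; the two predicates show up merged into one selection.
```python
import polars as pl

lf = (
    pl.LazyFrame({"m2_living": [2000, 3000], "price": [250_000, 400_000]})
    .filter(pl.col("m2_living") > 2500)
    .filter(pl.col("price") < 300_000)  # written as a second, separate filter
)
# the optimized plan contains a single combined selection, not two filters
print(lf.explain())
```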
### column assignment
```python
# pandas
df.assign(
tenXValue=lambda df_: df_.value * 10,
hundredXValue=lambda df_: df_.value * 100
)
# polars
df.with_columns(
tenXValue=pl.col("value") * 10,
hundredXValue=pl.col("value") * 100,
)
```
In Polars, the expressions inside a single `with_columns` call are executed in parallel. Because of that they cannot reference each other; a column derived from another new column needs a second call, as in the sketch below.
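A minimal sketch of that consequence, assuming a toy `value` column:
```python
import polars as pl

df = pl.DataFrame({"value": [1, 2, 3]})

# expressions inside one with_columns run in parallel, so tenXValue is
# not visible to its siblings; chain a second with_columns to derive from it
df = df.with_columns(
    tenXValue=pl.col("value") * 10,
).with_columns(
    hundredXValue=pl.col("tenXValue") * 10,
)
```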
### Be lazy
```python
# pandas
df = pd.read_csv(csv_file, usecols=['id1','v1'])
grouped_df = df.loc[:,['id1','v1']].groupby('id1').sum('v1')
# polars
df = pl.scan_csv(csv_file)
grouped_df = df.group_by('id1').agg(pl.col('v1').sum()).collect()
```
- In lazy mode the optimizer pushes the projection down, so only the needed columns (`id1`, `v1`) are read from the CSV (see the sketch below).
- Nothing runs until `.collect()` executes the query.
- Use `read_csv` instead of `scan_csv` for eager evaluation.
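A sketch of the projection pushdown at work, assuming `csv_file` points at a CSV with more columns than `id1` and `v1` (the path below is a placeholder):
```python
import polars as pl

csv_file = "data.csv"  # placeholder path

lf = (
    pl.scan_csv(csv_file)  # builds a query plan; nothing is read yet
    .group_by("id1")
    .agg(pl.col("v1").sum())
)
print(lf.explain())        # the plan projects only id1 and v1 from the file
grouped_df = lf.collect()  # execute the optimized plan
```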
## TEST
command: `migrate_conversion_to_audiencerepurchasecycle.py`
Runs after the order import finishes during customer onboarding; it uses all orders to compute each audience's repurchase cycle, amounts, and so on.
```python
@print_peak_mem_and_consum_time
def test():
org = Organization.objects.get(abbr="V..")
d_org = org.get_d_org()
conv_qs = d_org.conversion_set.filter(
transaction_status__in=Conversion.success_type()
)
df = generate_df_from_qs(
conv_qs.values_list("user", "datetime", "transaction_revenue"),
columns=["user", "datetime", "transaction_revenue"],
)
df = mapping_corresponding_audience_to_user(d_org, df)
    # computes the 9 metrics
df = df.groupby("ma_audience").agg(
first_purchase_datetime=("datetime", "min"),
last_purchase_datetime=("datetime", "max"),
total_consumption_amount=("transaction_revenue", "sum"),
total_consumption_times=("datetime", "count"),
last_repurchase_cycle=("datetime", cal_last_repurchase_cycle),
)
df["repurchase_datetime"] = df.apply(update_repurchase_datetime, axis=1)
df["average_repurchase_cycle"] = df.apply(update_average_repurchase_cycle, axis=1)
df["average_consumption_amount"] = df.apply(
update_average_consumption_amount, axis=1
)
df["per_year_consumption_amount"], df["per_year_consumption_times"] = zip(
*df.apply(update_per_year_data, axis=1)
)
return df
```
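For comparison, a hedged sketch of how the grouped aggregation could look in Polars. The `update_*` helpers and `cal_last_repurchase_cycle` are project code whose logic is only approximated here; `last_repurchase_cycle` is assumed to be the gap between the two most recent purchases.
```python
import polars as pl

def test_polars(df: pl.DataFrame) -> pl.DataFrame:
    # df is assumed to already have columns:
    # ma_audience, user, datetime, transaction_revenue
    return df.group_by("ma_audience").agg(
        first_purchase_datetime=pl.col("datetime").min(),
        last_purchase_datetime=pl.col("datetime").max(),
        total_consumption_amount=pl.col("transaction_revenue").sum(),
        total_consumption_times=pl.col("datetime").count(),
        # assumption: gap between the two most recent purchases
        last_repurchase_cycle=pl.col("datetime").sort().diff().last(),
    )
```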
The decorator used to measure peak memory and elapsed time:
```python
# decorator to measure peak memory and elapsed time
import time
import tracemalloc
from functools import wraps

import psutil


def print_peak_mem_and_consum_time(func):
    @wraps(func)
    def inner(*args, **kwargs):
        # get initial CPU times
        process = psutil.Process()
        start_cpu_times = process.cpu_times()
        start_time = time.time()
        tracemalloc.start()
        result = func(*args, **kwargs)
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        end_cpu_times = process.cpu_times()
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Memory Current: {current / 1024 / 1024:.4f} MB")
        print(f"Memory Peak: {peak / 1024 / 1024:.4f} MB")
        print(f"Time: {elapsed_time:.4f} seconds")
        return result

    return inner
```
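A minimal usage sketch; the toy `build_list` function below is hypothetical, only there to show the shape of the printed output:
```python
@print_peak_mem_and_consum_time
def build_list():
    # hypothetical workload, only here to exercise the decorator
    return [i * i for i in range(1_000_000)]

build_list()
# prints:
# Memory Current: ... MB
# Memory Peak: ... MB
# Time: ... seconds
```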
Result (note: `tracemalloc` only tracks allocations made through Python's memory allocator, so memory allocated inside Polars' native core may be under-reported):
```python
In [37]: org = Organization.objects.get(abbr="V..")
In [38]: d_org = org.get_d_org()
In [42]: conv_qs = d_org.conversion_set.filter(transaction_status__in=Conversion.success_type())
In [43]: conv_qs.count()
Out[43]: 169548
# pandas
In [2]: df = test()
Memory Current: 6.9714 MB
Memory Peak: 55.6687 MB
Time: 147.8921 seconds
In [9]: df = test()
Memory Current: 6.4535 MB
Memory Peak: 55.2806 MB
Time: 125.4330 seconds
# polars
Memory Current: 0.6402 MB
Memory Peak: 42.5357 MB
Time: 26.4460 seconds
Memory Current: 0.6474 MB
Memory Peak: 42.5343 MB
Time: 20.7126 seconds
```