getml · cyclux · Dec 5, 2025 · Dec 14, 2025
diff --git a/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
@@ -0,0 +1,47 @@
+# Generate Parquet files from Jaffle Shop CSV data
+
+## Prerequisites
+
+- `pipx` (runs `jafgen`) and `uv` (runs the conversion script)
+- `gcloud` CLI
+
+The `convert_jaffle_csv_to_parquet.py` script reads the Jaffle Shop CSV files and converts them to Parquet format for more efficient storage and querying in Snowflake.
+
+### Dependencies
+
+Ensure you are in the `integration/jaffle-shop-data` directory and have `uv` set up:
+
+```bash
+cd integration/jaffle-shop-data/
+uv sync
+```
+
+## Generate Jaffle Shop Data (CSV)
+
+To generate six years of Jaffle Shop CSV data (the `6` argument),
+run the following command in the `integration/jaffle-shop-data` directory:
+
+```bash
+pipx run jafgen 6
+```
+
+This will create the necessary CSV files in the `jaffle-data` directory.
+
+## Convert CSV to Parquet
+
+To convert the generated CSV files to Parquet format, run the following script:
+
+```bash
+uv run python convert_jaffle_csv_to_parquet.py
+```
+
+This will read the seven `raw_*.csv` files listed in the script from the `jaffle-data` directory and save the corresponding Parquet files in the `jaffle-data/parquet` directory.
+
+## Upload Parquet Files to GCP
+
+To upload the Parquet files to the `gs://static.getml.com` bucket, use the following commands:
+
+```bash
+gcloud config set project getml-infra
+gcloud storage cp jaffle-data/parquet/*.parquet gs://static.getml.com/datasets/jaffle_shop/
+```
diff --git a/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+import pandas as pd
+
+NAMES: list[str] = [
+    "raw_customers",
+    "raw_items",
+    "raw_orders",
+    "raw_products",
+    "raw_stores",
+    "raw_supplies",
+    "raw_tweets",
+]
+
+JAFFLE_CSV_DATA_PATH = Path("jaffle-data")
+
+if not JAFFLE_CSV_DATA_PATH.exists():
+    raise FileNotFoundError(
+        f"Jaffle CSV data path {JAFFLE_CSV_DATA_PATH} does not exist."
+        " Please run `pipx run jafgen 6` to generate CSVs. (6 years)"
+    )
+
+JAFFLE_PARQUET_DATA_PATH: Path = JAFFLE_CSV_DATA_PATH / "parquet"
+JAFFLE_PARQUET_DATA_PATH.mkdir(parents=True, exist_ok=True)
+
+
+for name in NAMES:
+    csv_filepath = JAFFLE_CSV_DATA_PATH / f"{name}.csv"
+    parquet_filepath = JAFFLE_PARQUET_DATA_PATH / f"{name}.parquet"
+    print(f"Loading {csv_filepath}...")
+
+    df: pd.DataFrame = pd.read_csv(csv_filepath)
+
+    # 'index=False' prevents adding an extra index column
+    df.to_parquet(parquet_filepath, index=False)
+
+    print(f"Converted {name} to parquet format at {parquet_filepath}.")
diff --git a/integration/jaffle-shop-data/pyproject.toml b/integration/jaffle-shop-data/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "jaffle-shop-parquet-generator"
+version = "0.1.0"
+description = "Convert Jaffle Shop CSV data to Parquet format"
+requires-python = ">=3.12"
+
+dependencies = [
+    "pandas>=2.0.0",
+    "pyarrow>=14.0.0",
+]
+
+[tool.uv]
+package = false