import datetime

import pandas as pd
from sqlalchemy import create_engine, text

# 1) Connect Python to the Sakila DB
# Uncomment and fill in your own credentials before calling the functions below:
# engine = create_engine("mysql+pymysql://username:password@localhost:3306/sakila")


# 2) Pull raw rentals for a given month/year
def rentals_month(engine, month: int, year: int) -> pd.DataFrame:
    """Return the rows of ``rental`` whose ``rental_date`` falls in one month.

    Args:
        engine: Object whose ``connect()`` returns a connection usable as a
            context manager and accepted by ``pd.read_sql`` (a SQLAlchemy
            engine pointing at the Sakila database).
        month: Calendar month, 1-12.
        year: Calendar year.

    Returns:
        DataFrame with the columns rental_id, rental_date, inventory_id,
        customer_id, staff_id and return_date, ordered by rental_date.

    Raises:
        ValueError: If ``month``/``year`` do not form a valid calendar date
            (for example ``month=13``).
    """
    # datetime.date validates the inputs, so an impossible month fails loudly
    # instead of silently producing an empty result.
    start = datetime.date(int(year), int(month), 1)
    end = (
        datetime.date(start.year + 1, 1, 1)
        if start.month == 12
        else datetime.date(start.year, start.month + 1, 1)
    )

    # A half-open range [start, end) on the bare column lets the database use
    # an index on rental_date; wrapping the column in YEAR()/MONTH() prevents
    # that and only works on backends that provide those functions.
    query = text(
        """
        SELECT rental_id, rental_date, inventory_id, customer_id, staff_id, return_date
        FROM rental
        WHERE rental_date >= :start
          AND rental_date < :end
        ORDER BY rental_date
        """
    )

    with engine.connect() as conn:
        return pd.read_sql(query, conn, params={"start": start, "end": end})


# 3) Count rentals per customer for that month/year
def rental_count_month(rentals_df: pd.DataFrame, month: int, year: int) -> pd.DataFrame:
    """Count the rows of ``rentals_df`` per ``customer_id``.

    ``month`` and ``year`` are used only to name the count column; the frame
    is not filtered here, so pass rentals that already belong to that period.

    Args:
        rentals_df: Rentals with at least a ``customer_id`` column.
        month: Month used in the column label (zero-padded to two digits).
        year: Year used in the column label.

    Returns:
        DataFrame with two columns: ``customer_id`` and ``rentals_MM_YYYY``.
    """
    col_name = f"rentals_{month:02d}_{year}"
    # size() on a plain groupby always yields a Series; reset_index(name=...)
    # turns it into the two-column frame regardless of pandas version.
    return rentals_df.groupby("customer_id").size().reset_index(name=col_name)


# 4) Compare two month-count DataFrames and compute difference
def compare_rentals(df_a: pd.DataFrame, df_b: pd.DataFrame) -> pd.DataFrame:
    """Join two per-customer count frames and compute the change between them.

    Only customers present in both frames are kept (inner join on
    ``customer_id``). ``difference`` is the count from ``df_b`` minus the
    count from ``df_a``.

    Args:
        df_a: Frame with ``customer_id`` and exactly one count column
            (the first period).
        df_b: Frame with ``customer_id`` and exactly one count column
            (the second period).

    Returns:
        The merged frame plus a ``difference`` column, sorted by
        ``difference`` descending, then ``customer_id`` ascending, with a
        fresh 0..n-1 index. If both inputs use the same count column name,
        the merged columns carry pandas' default ``_x``/``_y`` suffixes.

    Raises:
        ValueError: If either frame does not have exactly one column besides
            ``customer_id``.
    """
    # Identify the rentals columns (everything except customer_id)
    value_cols_a = [c for c in df_a.columns if c != "customer_id"]
    value_cols_b = [c for c in df_b.columns if c != "customer_id"]
    if len(value_cols_a) != 1 or len(value_cols_b) != 1:
        raise ValueError("Each input DataFrame must have exactly one rentals_* column plus customer_id.")

    col_a = value_cols_a[0]
    col_b = value_cols_b[0]

    merged = df_a.merge(df_b, on="customer_id", how="inner")  # active in BOTH months

    if col_a == col_b:
        # merge() disambiguates clashing names with _x (left) and _y (right),
        # so the unsuffixed name no longer exists in the merged frame.
        first, second = f"{col_a}_x", f"{col_b}_y"
    else:
        first, second = col_a, col_b

    merged["difference"] = merged[second] - merged[first]
    return merged.sort_values(
        ["difference", "customer_id"], ascending=[False, True]
    ).reset_index(drop=True)
| \n", + " | customer_id | \n", + "rentals_05_2005 | \n", + "rentals_06_2005 | \n", + "difference | \n", + "
|---|---|---|---|---|
| 0 | \n", + "454 | \n", + "1 | \n", + "10 | \n", + "9 | \n", + "
| 1 | \n", + "213 | \n", + "1 | \n", + "9 | \n", + "8 | \n", + "
| 2 | \n", + "295 | \n", + "1 | \n", + "9 | \n", + "8 | \n", + "
| 3 | \n", + "457 | \n", + "1 | \n", + "9 | \n", + "8 | \n", + "
| 4 | \n", + "27 | \n", + "1 | \n", + "8 | \n", + "7 | \n", + "