diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..68586da Binary files /dev/null and b/.DS_Store differ diff --git a/.ipynb_checkpoints/sql-python-connection-checkpoint.ipynb b/.ipynb_checkpoints/sql-python-connection-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/.ipynb_checkpoints/sql-python-connection-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sql-python-connection-lab.ipynb b/sql-python-connection-lab.ipynb new file mode 100644 index 0000000..76b1688 --- /dev/null +++ b/sql-python-connection-lab.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "c36f412d-502f-4653-a6bb-e6e164089e15", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine, text\n", + "\n", + "# 1) Connect Python to the Sakila DB\n", + "\n", + "#engine = create_engine(\"mysql+pymysql://username:password@localhost:3306/sakila\")\n", + "\n", + "# 2) Pull raw rentals for a given month/year\n", + "def rentals_month(engine, month: int, year: int) -> pd.DataFrame:\n", + " \"\"\"\n", + " Returns rows from sakila.rental for the given month/year as a DataFrame.\n", + " \"\"\"\n", + " query = text(\"\"\"\n", + " SELECT rental_id, rental_date, inventory_id, customer_id, staff_id, return_date\n", + " FROM rental\n", + " WHERE YEAR(rental_date) = :year\n", + " AND MONTH(rental_date) = :month\n", + " ORDER BY rental_date;\n", + " \"\"\")\n", + "\n", + " with engine.connect() as conn:\n", + " df = pd.read_sql(query, conn, params={\"year\": year, \"month\": month})\n", + "\n", + " return df\n", + "\n", + "\n", + "# 3) Count rentals per customer for that month/year\n", + "def rental_count_month(rentals_df: pd.DataFrame, month: int, year: int) -> pd.DataFrame:\n", + " \"\"\"\n", + " Groups rentals_df by customer_id and returns a DataFrame:\n", + " customer_id | rentals_MM_YYYY\n", + " \"\"\"\n", + " col_name = f\"rentals_{month:02d}_{year}\"\n", + " counts = (\n", + " rentals_df\n", + " .groupby(\"customer_id\", as_index=False)\n", + " .size()\n", + " .rename(columns={\"size\": col_name})\n", + " )\n", + " return counts\n", + "\n", + "\n", + "# 4) Compare two month-count DataFrames and compute difference\n", + "def compare_rentals(df_a: pd.DataFrame, df_b: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " where difference = (second month column) - (first month column)\n", + " also filters to customers active in both months (inner join)\n", + " \"\"\"\n", + " # Identify the rentals columns (everything except customer_id)\n", + " col_a = [c for c in df_a.columns if c != \"customer_id\"]\n", + " col_b = [c for c in df_b.columns if c != \"customer_id\"]\n", + " if len(col_a) != 1 or len(col_b) != 1:\n", + " raise ValueError(\"Each input DataFrame must have exactly one rentals_* column plus customer_id.\")\n", + "\n", + " col_a = col_a[0]\n", + " col_b = col_b[0]\n", + "\n", + " merged = df_a.merge(df_b, on=\"customer_id\", how=\"inner\") # active in BOTH months\n", + " merged[\"difference\"] = merged[col_b] - merged[col_a]\n", + " return merged.sort_values([\"difference\", \"customer_id\"], ascending=[False, True]).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e8ac67f2-4936-4c71-bcae-99bb59c489a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005rentals_06_2005difference
04541109
1213198
2295198
3457198
427187
\n", + "
" + ], + "text/plain": [ + " customer_id rentals_05_2005 rentals_06_2005 difference\n", + "0 454 1 10 9\n", + "1 213 1 9 8\n", + "2 295 1 9 8\n", + "3 457 1 9 8\n", + "4 27 1 8 7" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example: customers active in BOTH May and June 2005, and how activity differed\n", + "\n", + "may_df = rentals_month(engine, 5, 2005)\n", + "jun_df = rentals_month(engine, 6, 2005)\n", + "may_ct = rental_count_month(may_df, 5, 2005)\n", + "jun_ct = rental_count_month(jun_df, 6, 2005)\n", + "comparison = compare_rentals(may_ct, jun_ct)\n", + "comparison.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a0c519e-ef9a-4e51-9dbc-f41e425c4077", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}