# %% [markdown]
# "Open in Colab" badge cell (notebook cell id: view-in-github).
# NOTE(review): only the text "Open" of the badge markup is visible in the
# notebook source, so the link itself is not reproduced here.

# %%
# Mount Google Drive inside the Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

# %%
# Collaborative-filtering ("BRICK") rating model: train an SVD recommender,
# evaluate it on a hold-out split, then predict one user's rating for every
# business.
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import numpy as np

# `df` is not created in this cell; it must already exist in the session with
# 'user_id', 'business_id' and 'rating' columns, for example:
# df = pd.read_csv('your_data.csv')

reader = Reader(rating_scale=(1, 5))  # Assuming ratings are on a scale of 1 to 5
data = Dataset.load_from_df(df[['user_id', 'business_id', 'rating']], reader)

# Hold out 20% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=0.2)

# Train the BRICK recommendation model (using SVD here as an example).
model = SVD()
model.fit(trainset)

# `model.test` returns one surprise `Prediction` per test rating. A Prediction
# is a five-field namedtuple (uid, iid, r_ui, est, details), so it cannot be
# unpacked into three names; read the fields by attribute instead.
y_pred = model.test(testset)
y_true = [pred.r_ui for pred in y_pred]          # true ratings
y_pred_ratings = [pred.est for pred in y_pred]   # estimated ratings

# Evaluation metrics.
mse = mean_squared_error(y_true, y_pred_ratings)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same value on every version.
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred_ratings)
r2 = r2_score(y_true, y_pred_ratings)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")


def predict_success_for_business_ids(business_ids, model, features_df, feature_columns, user_id=None):
    """Predict one user's rating for each business in *business_ids*.

    Args:
        business_ids: Iterable of business IDs to score. The result keeps
            their order (and any duplicates).
        model: Fitted recommender exposing ``predict(uid, iid)`` whose result
            has an ``est`` attribute (e.g. a trained surprise ``SVD``).
        features_df: Unused by the recommender; kept so existing call sites
            that pass it positionally keep working.
        feature_columns: Unused by the recommender; kept for the same reason.
        user_id: User whose ratings are predicted. When omitted, the
            module-level ``user_id`` variable is used, which is what earlier
            versions of this function always did.

    Returns:
        A DataFrame with a "business_id" column and a "success_probability"
        column. Despite the column name, the values are the model's estimated
        ratings, not probabilities.

    Raises:
        ValueError: If no user id is passed and no module-level ``user_id``
            is defined.
    """
    if user_id is None:
        # Backward compatibility: callers used to rely on a notebook-level
        # `user_id` being defined before the function was called.
        user_id = globals().get("user_id")
    if user_id is None:
        raise ValueError(
            "user_id is required: pass user_id=... or define a module-level user_id"
        )

    # Materialize once so the same sequence feeds both the predictions and
    # the result frame, even if a generator was passed in.
    business_ids = list(business_ids)
    estimated_ratings = [
        model.predict(user_id, business_id).est for business_id in business_ids
    ]

    return pd.DataFrame({
        "business_id": business_ids,
        "success_probability": estimated_ratings,
    })


# Example usage
# `df` holds one row per rating, so a business appears once per rating it
# received; de-duplicate so each business is predicted exactly once.
business_ids_list = df['business_id'].unique().tolist()
business_ids_to_predict = business_ids_list  # Use this for prediction

# Specify a user_id for the predictions
user_id = 'your_user_id'  # Replace with actual user ID
success_probabilities = predict_success_for_business_ids(
    business_ids_to_predict, model, df, ['business_id'], user_id=user_id
)

print("Predicted Success Probabilities:")
print(success_probabilities)

# Look up a single business across the Spark tables.
# `spark`, `business_df`, `review_df` and `checkin_df` are not created in this
# cell; they are expected to exist already in the notebook session.
business_df.createOrReplaceTempView("business_temp")
review_df.createOrReplaceTempView("review_temp")
checkin_df.createOrReplaceTempView("checkin_temp")
business_id_to_search = 'SuSEmi52lP8gquHV0XIB9g'  # Replace with your variable

# Using SQL queries
# NOTE(review): the ID is interpolated directly into the SQL text. That is
# fine for a hard-coded ID like the one above, but switch to a DataFrame
# filter or a parameterized query before feeding user-supplied values here.
for view_name in ("business_temp", "review_temp", "checkin_temp"):
    spark.sql(
        f"SELECT * FROM {view_name} WHERE business_id = '{business_id_to_search}'"
    ).show()