From 0f57eea8b7b058b491c287dd3cd05ae509741ff9 Mon Sep 17 00:00:00 2001 From: aiswaryarkamath <141299653+aiswaryarkamath@users.noreply.github.com> Date: Tue, 5 Nov 2024 00:00:25 -0500 Subject: [PATCH] Add files via upload --- cleaned_review_sentiment.ipynb | 1158 ++++++++++++++++++++++++++++++++ review_sentiment_code.ipynb | 252 +++++++ 2 files changed, 1410 insertions(+) create mode 100644 cleaned_review_sentiment.ipynb create mode 100644 review_sentiment_code.ipynb diff --git a/cleaned_review_sentiment.ipynb b/cleaned_review_sentiment.ipynb new file mode 100644 index 0000000..d52406d --- /dev/null +++ b/cleaned_review_sentiment.ipynb @@ -0,0 +1,1158 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FEquGu9_xCgO" + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "1u8Lh2e_xNmq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# from google.colab import drive\n", + "# drive.mount('/content/drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 304 + }, + "id": "jBWta1fQxW5e", + "outputId": "a1fc5de8-5100-462d-a044-0bcfa618d671" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "error", + "ename": "MessageError", + "evalue": "Error: credential propagation was unsuccessful", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mMessageError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mgoogle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolab\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdrive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdrive\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/content/drive'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/drive.py\u001b[0m in \u001b[0;36mmount\u001b[0;34m(mountpoint, force_remount, timeout_ms, readonly)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmountpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_ms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m120000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreadonly\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0;34m\"\"\"Mount your Google Drive at the specified mountpoint path.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 100\u001b[0;31m return _mount(\n\u001b[0m\u001b[1;32m 101\u001b[0m \u001b[0mmountpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforce_remount\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/drive.py\u001b[0m in \u001b[0;36m_mount\u001b[0;34m(mountpoint, force_remount, timeout_ms, ephemeral, readonly)\u001b[0m\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mephemeral\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m _message.blocking_request(\n\u001b[0m\u001b[1;32m 138\u001b[0m \u001b[0;34m'request_auth'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'authType'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'dfs_ephemeral'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mblocking_request\u001b[0;34m(request_type, request, timeout_sec, parent)\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0mrequest_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpect_reply\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 175\u001b[0m )\n\u001b[0;32m--> 176\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mread_reply_from_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_sec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mread_reply_from_input\u001b[0;34m(message_id, timeout_sec)\u001b[0m\n\u001b[1;32m 101\u001b[0m ):\n\u001b[1;32m 102\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'error'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mreply\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMessageError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreply\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 104\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mreply\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mMessageError\u001b[0m: Error: credential propagation was unsuccessful" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv('/content/drive/MyDrive/ProcessedCSV/processed_data.csv')" + ], + "metadata": { + "id": "iNfVy6ebxNjO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 469 + }, + "id": "65rlkJcnxNha", + "outputId": "d557f07f-620c-4a9e-e093-84e1902e3fc0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " _id user_id business_id \\\n", + "0 66ea489ae59c7c5b6d8be1ac AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n", + "1 66ea489ae59c7c5b6d8be1ad NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n", + "2 66ea489ae59c7c5b6d8be1ae -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n", + "3 66ea489ae59c7c5b6d8be1af FjMQVZjSqY8syIO-53KFKw hV-bABTK-glh5wj31ps_Jw \n", + "4 66ea489ae59c7c5b6d8be1b0 ld0AperBXk1h6UbqmM80zw _uN0OudeJ3Zl_tf6nxg5ww \n", + "\n", + " text date \\\n", + "0 Avengers time with the ladies. 2012-05-18 02:17:21 \n", + "1 They have lots of good deserts and tasty cuban... 2013-02-05 18:35:10 \n", + "2 It's open even when you think it isn't 2013-08-18 00:56:08 \n", + "3 Very decent fried chicken 2017-06-27 23:05:38 \n", + "4 Appetizers.. platter special for lunch 2012-10-06 19:43:09 \n", + "\n", + " compliment_count sentiment_scores neg \\\n", + "0 0.0 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0 \n", + "1 0.0 {'neg': 0.0, 'neu': 0.756, 'pos': 0.244, 'comp... 0.0 \n", + "2 0.0 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0 \n", + "3 0.0 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0 \n", + "4 0.0 {'neg': 0.0, 'neu': 0.597, 'pos': 0.403, 'comp... 0.0 \n", + "\n", + " neu pos compound sentiment \n", + "0 1.000 0.000 0.0000 neutral \n", + "1 0.756 0.244 0.4404 positive \n", + "2 1.000 0.000 0.0000 neutral \n", + "3 1.000 0.000 0.0000 neutral \n", + "4 0.597 0.403 0.4019 positive " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_iduser_idbusiness_idtextdatecompliment_countsentiment_scoresnegneuposcompoundsentiment
066ea489ae59c7c5b6d8be1acAGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQAvengers time with the ladies.2012-05-18 02:17:210.0{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.01.0000.0000.0000neutral
166ea489ae59c7c5b6d8be1adNBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6IqjgThey have lots of good deserts and tasty cuban...2013-02-05 18:35:100.0{'neg': 0.0, 'neu': 0.756, 'pos': 0.244, 'comp...0.00.7560.2440.4404positive
266ea489ae59c7c5b6d8be1ae-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37GgIt's open even when you think it isn't2013-08-18 00:56:080.0{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.01.0000.0000.0000neutral
366ea489ae59c7c5b6d8be1afFjMQVZjSqY8syIO-53KFKwhV-bABTK-glh5wj31ps_JwVery decent fried chicken2017-06-27 23:05:380.0{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.01.0000.0000.0000neutral
466ea489ae59c7c5b6d8be1b0ld0AperBXk1h6UbqmM80zw_uN0OudeJ3Zl_tf6nxg5wwAppetizers.. platter special for lunch2012-10-06 19:43:090.0{'neg': 0.0, 'neu': 0.597, 'pos': 0.403, 'comp...0.00.5970.4030.4019positive
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# drop the unwanted columns\n", + "\n", + "import pandas as pd\n", + "\n", + "df = pd.read_csv('/content/drive/MyDrive/ProcessedCSV/processed_data.csv')\n", + "\n", + "df = df.drop(['compliment_count', 'sentiment_scores', 'neg', 'text', 'pos', 'neu'], axis=1)\n", + "\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "suObqpwJxNfd", + "outputId": "6c9e2f45-0669-4154-97c8-3ded47ac3b70" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " _id user_id business_id \\\n", + "0 66ea489ae59c7c5b6d8be1ac AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n", + "1 66ea489ae59c7c5b6d8be1ad NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n", + "2 66ea489ae59c7c5b6d8be1ae -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n", + "3 66ea489ae59c7c5b6d8be1af FjMQVZjSqY8syIO-53KFKw hV-bABTK-glh5wj31ps_Jw \n", + "4 66ea489ae59c7c5b6d8be1b0 ld0AperBXk1h6UbqmM80zw _uN0OudeJ3Zl_tf6nxg5ww \n", + "\n", + " date compound sentiment \n", + "0 2012-05-18 02:17:21 0.0000 neutral \n", + "1 2013-02-05 18:35:10 0.4404 positive \n", + "2 2013-08-18 00:56:08 0.0000 neutral \n", + "3 2017-06-27 23:05:38 0.0000 neutral \n", + "4 2012-10-06 19:43:09 0.4019 positive " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_iduser_idbusiness_iddatecompoundsentiment
066ea489ae59c7c5b6d8be1acAGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQ2012-05-18 02:17:210.0000neutral
166ea489ae59c7c5b6d8be1adNBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6Iqjg2013-02-05 18:35:100.4404positive
266ea489ae59c7c5b6d8be1ae-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37Gg2013-08-18 00:56:080.0000neutral
366ea489ae59c7c5b6d8be1afFjMQVZjSqY8syIO-53KFKwhV-bABTK-glh5wj31ps_Jw2017-06-27 23:05:380.0000neutral
466ea489ae59c7c5b6d8be1b0ld0AperBXk1h6UbqmM80zw_uN0OudeJ3Zl_tf6nxg5ww2012-10-06 19:43:090.4019positive
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# rename compound column as sentiment score\n", + "\n", + "df = df.rename(columns={'compound': 'sentiment_score'})\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "n-g4_szVyIv4", + "outputId": "48bc2b6e-4196-4f73-b1a4-fb2568d47178" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " _id user_id business_id \\\n", + "0 66ea489ae59c7c5b6d8be1ac AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n", + "1 66ea489ae59c7c5b6d8be1ad NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n", + "2 66ea489ae59c7c5b6d8be1ae -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n", + "3 66ea489ae59c7c5b6d8be1af FjMQVZjSqY8syIO-53KFKw hV-bABTK-glh5wj31ps_Jw \n", + "4 66ea489ae59c7c5b6d8be1b0 ld0AperBXk1h6UbqmM80zw _uN0OudeJ3Zl_tf6nxg5ww \n", + "\n", + " date sentiment_score sentiment \n", + "0 2012-05-18 02:17:21 0.0000 neutral \n", + "1 2013-02-05 18:35:10 0.4404 positive \n", + "2 2013-08-18 00:56:08 0.0000 neutral \n", + "3 2017-06-27 23:05:38 0.0000 neutral \n", + "4 2012-10-06 19:43:09 0.4019 positive " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_iduser_idbusiness_iddatesentiment_scoresentiment
066ea489ae59c7c5b6d8be1acAGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQ2012-05-18 02:17:210.0000neutral
166ea489ae59c7c5b6d8be1adNBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6Iqjg2013-02-05 18:35:100.4404positive
266ea489ae59c7c5b6d8be1ae-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37Gg2013-08-18 00:56:080.0000neutral
366ea489ae59c7c5b6d8be1afFjMQVZjSqY8syIO-53KFKwhV-bABTK-glh5wj31ps_Jw2017-06-27 23:05:380.0000neutral
466ea489ae59c7c5b6d8be1b0ld0AperBXk1h6UbqmM80zw_uN0OudeJ3Zl_tf6nxg5ww2012-10-06 19:43:090.4019positive
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# save csv as review sentiments to the processed csv\n", + "\n", + "df.to_csv('/content/drive/MyDrive/ProcessedCSV/review_sentiments.csv', index=False)" + ], + "metadata": { + "id": "NK7HXoguyNol" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "RQzig1ufykGi" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/review_sentiment_code.ipynb b/review_sentiment_code.ipynb new file mode 100644 index 0000000..204893c --- /dev/null +++ b/review_sentiment_code.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XU9lMuZdVJya", + "outputId": "61bb0d36-5bb4-46bd-accb-a45bfe942fe8" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pyspark in /usr/local/lib/python3.10/dist-packages (3.5.3)\n", + "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n" + ] + } + ], + "source": [ + "!pip install pyspark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 219 + }, + "id": "Z6QU3P9Gnn_e", + "outputId": "2289cc66-8b31-460b-d711-893542b014de" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v3.5.3
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
MySparkSession
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder.appName(\"MySparkSession\").getOrCreate()\n", + "\n", + "# Check if the SparkSession is created successfully\n", + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0Q_wxl8osDnA" + }, + "outputs": [], + "source": [ + "# import pandas as pd\n", + "from textblob import TextBlob\n", + "import nltk\n", + "from nltk.tokenize import sent_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pBwb1Nt1tbfA" + }, + "outputs": [], + "source": [ + "df_reviews = spark.read.csv('/content/drive/MyDrive/ProcessedCSV/final_review.csv', header=True, inferSchema=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QItJizQrdxm5" + }, + "outputs": [], + "source": [ + "# reviews.info()\n", + "# tips.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "njh0uOuHTyY_" + }, + "outputs": [], + "source": [ + "# reviews.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pwUQV23TLkcn" + }, + "outputs": [], + "source": [ + "# tips.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2RDxxtJhn7fs" + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "from textblob import TextBlob\n", + "\n", + "# Function to determine sentiment\n", + "def get_sentiment(text):\n", + " analysis = TextBlob(text)\n", + " # Classify sentiment\n", + " if analysis.sentiment.polarity > 0:\n", + " return 'positive'\n", + " elif analysis.sentiment.polarity < 0:\n", + " return 'negative'\n", + " else:\n", + " return 'neutral'\n", + "\n", + "# Create a UDF (User-Defined Function) from the get_sentiment function\n", + "sentiment_udf = udf(get_sentiment, StringType())\n", + "\n", + "# Apply the UDF to the 'text' column\n", + "df_reviews = df_reviews.withColumn('sentiment', sentiment_udf(df_reviews['text']))\n", + "\n", + "# Save the updated DataFrame to a new CSV file\n", + "df_reviews.write.csv('reviews_with_sentiment.csv', header=True, mode='overwrite')\n", + "\n", + "print(\"Sentiment analysis completed and saved to 'reviews_with_sentiment.csv'.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xNUHi2OwoYTk" + }, + "outputs": [], + "source": [ + "# read and display csv file\n", + "df_reviews_with_sentiment = spark.read.csv('/content/reviews_with_sentiment.csv', header=True, inferSchema=True)\n", + "df_reviews_with_sentiment.show()" + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Drop the first row from the DataFrame\n", + "df_reviews_with_sentiment = df_reviews_with_sentiment.filter(df_reviews_with_sentiment.index != 0)\n", + "\n", + "# Show the updated DataFrame\n", + "df_reviews_with_sentiment.show()" + ], + "metadata": { + "id": "wpyC_3Uok9BG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FhbqCJiroYQG" + }, + "outputs": [], + "source": [ + "# save CSV file in the 'ProcessedCSV' folder\n", + "df_reviews_with_sentiment.write.csv('/content/drive/MyDrive/ProcessedCSV/reviews_with_sentiment.csv', header=True, mode='overwrite')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "A9IZAWrFoYN_" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UIJBifIIoYLP" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file