{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyMsMDMLbRZOHftVdwNBEnSa",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/milver/Experiments/blob/main/NewsletterSummary.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# import the required libraries\n",
        "from googleapiclient.discovery import build\n",
        "from google_auth_oauthlib.flow import InstalledAppFlow\n",
        "from google.auth.transport.requests import Request\n",
        "import pickle\n",
        "import os.path\n",
        "import base64\n",
        "import email\n",
        "from bs4 import BeautifulSoup\n"
      ],
      "metadata": {
        "id": "NFKDciEEMdVx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Gmail OAuth scopes. Remove the token.pickle file whenever this list changes.\n",
        "SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']\n",
        "\n",
        "# Holds the user's access token; stays None until one is loaded or created below.\n",
        "creds = None\n",
        "\n",
        "# Reuse the token cached in token.pickle when that file is present.\n",
        "if os.path.exists('token.pickle'):\n",
        "    with open('token.pickle', 'rb') as token:\n",
        "        creds = pickle.load(token)\n",
        "\n",
        "# Missing or invalid credentials: refresh them if possible, otherwise ask the user to log in.\n",
        "if not (creds and creds.valid):\n",
        "    if creds and creds.expired and creds.refresh_token:\n",
        "        creds.refresh(Request())\n",
        "    else:\n",
        "        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)\n",
        "        creds = flow.run_local_server(port=0)\n",
        "\n",
        "    # Store the resulting token in token.pickle so the next run can reuse it.\n",
        "    with open('token.pickle', 'wb') as token:\n",
        "        pickle.dump(creds, token)\n",
        "\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "PwmUB5RTCmJB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from inspect import formatannotationrelativeto\n",
        "import base64\n",
        "from googleapiclient.discovery import build\n",
        "from bs4 import BeautifulSoup\n",
        "\n",
        "def extract_text_with_links(soup):\n",
        "    \"\"\"Flatten parsed HTML to plain text while keeping link targets readable.\n",
        "\n",
        "    Every ``<a>`` element found in ``soup`` is replaced in place, so the\n",
        "    passed-in object is modified by this call:\n",
        "\n",
        "    * text and href  -> ``text [href]``\n",
        "    * href only      -> ``[href]``\n",
        "    * no href        -> the anchor text alone\n",
        "\n",
        "    Args:\n",
        "        soup: BeautifulSoup object (or tag) to flatten.\n",
        "\n",
        "    Returns:\n",
        "        The result of ``soup.get_text()`` after the anchors were rewritten.\n",
        "    \"\"\"\n",
        "    for anchor in soup.find_all('a'):\n",
        "        href = anchor.get('href', '')\n",
        "        label = anchor.text\n",
        "        if not href:\n",
        "            # Nothing to link to: keep the visible text instead of appending an empty \"[]\".\n",
        "            replacement = label\n",
        "        elif label:\n",
        "            replacement = f\"{label} [{href}]\"\n",
        "        else:\n",
        "            replacement = f\"[{href}]\"\n",
        "        anchor.replace_with(replacement)\n",
        "    return soup.get_text()\n",
        "\n",
        "def _decode_part_data(data):\n",
        "    \"\"\"Decode the base64url string of a Gmail message part body into bytes.\"\"\"\n",
        "    # urlsafe_b64decode already understands the '-' and '_' alphabet; only the\n",
        "    # '=' padding has to be restored in case the input arrives without it.\n",
        "    return base64.urlsafe_b64decode(data + '=' * (-len(data) % 4))\n",
        "\n",
        "\n",
        "def _iter_leaf_parts(part):\n",
        "    \"\"\"Yield every part below ``part`` that has no sub-parts, depth first.\n",
        "\n",
        "    A payload without a ``parts`` list is yielded itself, which covers\n",
        "    single-part messages whose body sits directly on the payload.\n",
        "    \"\"\"\n",
        "    children = part.get('parts')\n",
        "    if children:\n",
        "        for child in children:\n",
        "            yield from _iter_leaf_parts(child)\n",
        "    else:\n",
        "        yield part\n",
        "\n",
        "\n",
        "def _extract_body_text(payload):\n",
        "    \"\"\"Concatenate the text of all text/plain and text/html parts of a payload.\"\"\"\n",
        "    chunks = []\n",
        "    for part in _iter_leaf_parts(payload):\n",
        "        data = part.get('body', {}).get('data', '')\n",
        "        if not data:\n",
        "            continue\n",
        "        raw = _decode_part_data(data)\n",
        "        mime_type = part.get('mimeType', '')\n",
        "        if mime_type == 'text/plain':\n",
        "            # Replace undecodable bytes so one odd part cannot abort the whole export.\n",
        "            chunks.append(raw.decode('utf-8', errors='replace'))\n",
        "        elif mime_type == 'text/html':\n",
        "            chunks.append(extract_text_with_links(BeautifulSoup(raw, 'lxml')))\n",
        "    return ''.join(chunks)\n",
        "\n",
        "\n",
        "def collect_emails(query, max_results=10, output_path='email_contents.txt'):\n",
        "    \"\"\"Write sender, subject and body text of matching Gmail messages to a file.\n",
        "\n",
        "    Uses the module-level ``creds`` to build the Gmail API client, so the\n",
        "    authentication cell has to run first. A message that carries both a\n",
        "    text/plain and a text/html part contributes both renderings to its body.\n",
        "\n",
        "    Args:\n",
        "        query: Gmail search query, e.g. 'label:Newsletters after:2024/04/12'.\n",
        "        max_results: Upper bound passed to the message list request.\n",
        "        output_path: File that is overwritten with the formatted messages.\n",
        "\n",
        "    Returns:\n",
        "        The file object that was written. It is already closed when returned.\n",
        "    \"\"\"\n",
        "    service = build('gmail', 'v1', credentials=creds)\n",
        "\n",
        "    result = service.users().messages().list(maxResults=max_results, q=query, userId='me').execute()\n",
        "    messages = result.get('messages', [])\n",
        "\n",
        "    with open(output_path, 'w', encoding='utf-8') as file:\n",
        "        for msg in messages:\n",
        "            message = service.users().messages().get(userId='me', id=msg['id']).execute()\n",
        "            payload = message.get('payload', {})\n",
        "\n",
        "            subject = ''\n",
        "            sender = ''\n",
        "            for header in payload.get('headers', []):\n",
        "                # Header names are case-insensitive, so compare them lower-cased.\n",
        "                name = header.get('name', '').lower()\n",
        "                if name == 'subject':\n",
        "                    subject = header.get('value', '')\n",
        "                elif name == 'from':\n",
        "                    sender = header.get('value', '')\n",
        "\n",
        "            body_text = _extract_body_text(payload)\n",
        "\n",
        "            file.write(f\"From: {sender}\\nSubject: {subject}\\nMessage:\\n{body_text.strip()}\\n\")\n",
        "            file.write(\"-\" * 80 + \"\\n\")  # Separator line\n",
        "    return file"
      ],
      "metadata": {
        "id": "HoayIunYG1Jy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Write the sender, subject and body text of the messages matching this Gmail search query to email_contents.txt.\n",
        "collect_emails('label:Newsletters after:2024/04/12')"
      ],
      "metadata": {
        "id": "UJIdlbIXBjez"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}