# %% [markdown]
# # Gmail API: authenticate with a read-only scope
#
# The cells below import the libraries used by this notebook and build the
# `creds` object that later cells pass to the Gmail API client.

# %%
# Import the required libraries (standard library first, then third-party).
import base64
import email
import os.path
import pickle

from bs4 import BeautifulSoup
from google.auth.exceptions import RefreshError
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# %%
# Define the SCOPES. If modifying it, delete the token.pickle file.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# The file token.pickle caches the user's access token between runs.
TOKEN_PATH = 'token.pickle'
# OAuth client secrets file used to start the interactive login flow.
CLIENT_SECRETS_PATH = 'credentials.json'


def load_credentials(scopes=SCOPES, token_path=TOKEN_PATH,
                     client_secrets_path=CLIENT_SECRETS_PATH):
    """Return usable Gmail credentials, logging the user in when necessary.

    The cached token at ``token_path`` is used as-is when it is still valid.
    An expired token that carries a refresh token is refreshed; if that
    refresh is rejected, or no usable token is cached, the interactive
    OAuth flow is run from ``client_secrets_path``. Whenever the credentials
    were refreshed or newly obtained they are written back to ``token_path``.

    Args:
        scopes: OAuth scopes to request when the interactive flow runs.
        token_path: Pickle file that caches the credentials between runs.
        client_secrets_path: OAuth client secrets JSON file.

    Returns:
        The credentials object loaded from the cache or produced by the flow.
    """
    cached = None
    if os.path.exists(token_path):
        # SECURITY: pickle.load executes arbitrary code from the file. Only
        # ever point token_path at a file this notebook wrote itself.
        with open(token_path, 'rb') as token_file:
            cached = pickle.load(token_file)

    if cached and cached.valid:
        return cached

    fresh = cached
    refreshed = False
    if cached and cached.expired and cached.refresh_token:
        try:
            cached.refresh(Request())
            refreshed = True
        except RefreshError:
            # The refresh token was rejected (e.g. revoked or expired); fall
            # through to a full interactive login instead of crashing.
            refreshed = False

    if not refreshed:
        flow = InstalledAppFlow.from_client_secrets_file(client_secrets_path, scopes)
        fresh = flow.run_local_server(port=0)

    # Save the access token in the token file for the next run.
    with open(token_path, 'wb') as token_file:
        pickle.dump(fresh, token_file)
    return fresh


# Variable creds stores the user access token used by the cells below.
creds = load_credentials()
# %%
# base64, build and BeautifulSoup are imported once, in the notebook's first
# code cell; they are deliberately not re-imported here.


def extract_text_with_links(soup):
    """Flatten parsed HTML to text while keeping hyperlink targets readable.

    Every ``<a>`` element found in ``soup`` is replaced in place by the string
    ``"<link text> [<href>]"``, or by ``"[<href>]"`` when the anchor has no
    text. The text of the whole tree is then returned.

    Args:
        soup: A BeautifulSoup tree. It is mutated: anchors become plain strings.

    Returns:
        str: ``soup.get_text()`` evaluated after the anchors were replaced.
    """
    for anchor in soup.find_all('a'):
        href = anchor.get('href', '')
        label = anchor.text
        anchor.replace_with(f"{label} [{href}]" if label else f"[{href}]")
    return soup.get_text()


def _header_value(headers, name):
    """Return the value of the last header called ``name`` (case-insensitive), or ''."""
    wanted = name.lower()
    value = ''
    for header in headers:
        if header.get('name', '').lower() == wanted:
            value = header.get('value', '')
    return value


def _decode_body_data(data):
    """Decode a base64url body string to bytes, restoring any stripped '=' padding."""
    return base64.urlsafe_b64decode(data + '=' * (-len(data) % 4))


def _part_text(part):
    """Return the text of one message part followed by the text of its sub-parts."""
    text = ''
    data = (part.get('body') or {}).get('data', '')
    if data:
        raw = _decode_body_data(data)
        mime_type = part.get('mimeType', '')
        if mime_type == 'text/plain':
            # Not every mail is UTF-8; replace undecodable bytes instead of
            # aborting the whole export.
            text += raw.decode('utf-8', errors='replace')
        elif mime_type == 'text/html':
            text += extract_text_with_links(BeautifulSoup(raw, 'lxml'))
    # Containers (multipart/*) may hold further containers, so recurse.
    for child in part.get('parts') or []:
        text += _part_text(child)
    return text


def collect_emails(query, service=None, output_path='email_contents.txt', max_results=10):
    """Write sender, subject and body text of the messages matching ``query`` to a file.

    One request lists up to ``max_results`` message ids; only that first page
    of results is used. Each message is then fetched and appended to
    ``output_path`` as ``From`` / ``Subject`` / ``Message`` followed by a line
    of 80 dashes. The file is overwritten on every call.

    The body is collected from the top-level payload and, recursively, from
    every nested part, so single-part and nested multipart messages are
    covered as well as flat multipart ones. Text from ``text/plain`` and
    ``text/html`` parts is concatenated in part order, which means a message
    carrying both alternatives contributes its content twice.

    Args:
        query: Gmail search expression, e.g. ``'label:Newsletters after:2024/04/12'``.
        service: Gmail API client to use. When omitted, one is built from the
            notebook-level ``creds`` variable.
        output_path: Path of the text file to write.
        max_results: Maximum number of messages requested from the list call.

    Returns:
        The file object that was written. It is already closed when returned;
        only attributes such as ``name`` remain useful.
    """
    if service is None:
        # Initialize the Gmail API from the credentials created earlier.
        service = build('gmail', 'v1', credentials=creds)

    listing = service.users().messages().list(
        maxResults=max_results, q=query, userId='me'
    ).execute()
    messages = listing.get('messages', [])

    with open(output_path, 'w', encoding='utf-8') as file:
        for msg in messages:
            message = service.users().messages().get(userId='me', id=msg['id']).execute()
            payload = message.get('payload', {})
            headers = payload.get('headers', [])

            subject = _header_value(headers, 'Subject')
            sender = _header_value(headers, 'From')
            body_text = _part_text(payload)

            # Write the formatted email details to the file
            file.write(f"From: {sender}\nSubject: {subject}\nMessage:\n{body_text.strip()}\n")
            file.write("-" * 80 + "\n")  # Separator line
    return file


# %%
# Inside a notebook kernel __name__ is '__main__', so this cell runs as before;
# the guard only prevents a network call when the code is imported as a module.
if __name__ == '__main__':
    collect_emails('label:Newsletters after:2024/04/12')