diff --git a/.env.example b/.env.example index 8f0cb921..c220024a 100644 --- a/.env.example +++ b/.env.example @@ -263,6 +263,18 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # See docs/operations/discord_chat_exporter.md (Tyrrrz upstream: Token and IDs, CLI guide). # DISCORD_USER_TOKEN=your.user.token # +# --- Internal Discord user token (compliance-gated) --- +# Do not put user token in .env when using workspace JSON. When enabled, tokens live in +# workspace JSON and are loaded at runtime (not at Django startup). Export can re-extract +# from the Chrome profile when JSON tokens are stale but the browser session is still valid. +# ALLOW_INTERNAL_DISCORD_TOKENS=false +# DISCORD_INTERNAL_TOKENS_JSON= +# Default path: workspace/discord_activity_tracker/discord_internal_tokens.json +# +# Chrome user-data directory (logged-in Discord session on disk): +# DISCORD_CHROME_PROFILE_PATH= +# Default: workspace/discord_activity_tracker/chrome_profile +# # DISCORD_SERVER_ID=987654321098765432 # DISCORD_CONTEXT_REPO_PATH=/absolute/path/to/discord-cplusplus-together-context # DISCORD_CONTEXT_AUTO_COMMIT=false diff --git a/CHANGELOG.md b/CHANGELOG.md index 67601a42..ee341420 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,7 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Resolved five cross-app import tech-debt edges: Pinecone via `cppa_pinecone_sync.sync_api`, dashboard model shim removed, CSV owner lookup via `cppa_user_tracker.services`, clang imports via `github_activity_tracker.sync_api`. - Added **import-linter** contracts and pre-commit hook to prevent regressions. - Enforced **service-layer-only ORM writes** with `scripts/check_service_layer_writes.py` and pre-commit; moved remaining direct writes (repo metadata sync, star bulk-update, GitHub file backfill, BoostVersion import, commit file-change backfill) into `github_activity_tracker.services` / `boost_library_tracker.services`. Allowlist [`.service-layer-write-allowlist.json`](.service-layer-write-allowlist.json) is empty by default for new debt only. -- **slack_event_handler:** Workspace under `workspace/slack_event_handler/`; replace Selenium with `plyvel` + `browser-cookie3` extraction from `CHROME_PROFILE_PATH` (optional Compose `slack-session` / `slack-chromium` noVNC on port 7900 and `manage.py extract_slack_tokens`), store xoxc/xoxd in `slack_internal_tokens.json` with runtime load and automatic re-extract when stale, and remove `slack_session_refresh`, `refresh_slack_tokens`, and the `slack-profile-refresh` compose service. +- **slack_event_handler:** Workspace under `workspace/slack_event_handler/`; huddle support configuration moved to workspace paths. - Pydantic boundary schemas at GitHub, Slack, and Discord ingestion (`api_schemas.py` per app; Discord ChatExporter uses `staging_schema.py`); fetchers validate with `model_validate()`; services accept typed payloads; `classify_failure` maps validation errors to `VALIDATION`. ## [0.1.0] - 2026-05-22 diff --git a/Makefile b/Makefile index 10a7a814..16803880 100644 --- a/Makefile +++ b/Makefile @@ -60,6 +60,14 @@ help: @echo " slack-tokens-reextract Stop chromium → extract JSON" @echo " slack-tokens-refresh Login (noVNC) → wait → extract JSON" @echo "" + @echo " Discord session (user token extraction)" + @echo " discord-login Start discord-chromium (noVNC http://127.0.0.1:7901)" + @echo " discord-wait-profile Wait until Discord login wrote Cookies + LevelDB" + @echo " discord-login-stop Stop discord-chromium before extract" + @echo " extract-discord-tokens Extract token to workspace JSON (one-shot)" + @echo " discord-tokens-reextract Stop chromium → extract JSON" + @echo " discord-tokens-refresh Login (noVNC) → wait → extract JSON" + @echo "" @echo " Utilities" @echo " clean-mac Remove macOS ._* resource-fork files" @echo " clean-pyc Remove compiled Python files" @@ -203,6 +211,36 @@ slack-tokens-reextract: extract-slack-tokens # Login in noVNC, wait for profile files, then extract JSON. slack-tokens-refresh: slack-login slack-wait-profile extract-slack-tokens +# ── Discord session ─────────────────────────────────────────────────────────── + +.PHONY: discord-login discord-wait-profile discord-login-stop extract-discord-tokens \ + discord-tokens-reextract discord-tokens-refresh + +discord-login: + @mkdir -p workspace/discord_activity_tracker/chrome_profile + @rm -f workspace/discord_activity_tracker/chrome_profile/SingletonLock \ + workspace/discord_activity_tracker/chrome_profile/SingletonCookie \ + workspace/discord_activity_tracker/chrome_profile/SingletonSocket + $(COMPOSE) --profile discord-session up -d --force-recreate discord-chromium + @echo "noVNC (password: secret) — Chrome does NOT open automatically:" + @echo " http://127.0.0.1:7901/?autoconnect=1&resize=scale&password=secret" + @echo "Right-click desktop → Web Browsing → Google Chrome → https://discord.com" + @command -v open >/dev/null 2>&1 && open "http://127.0.0.1:7901/?autoconnect=1&resize=scale&password=secret" || true + +discord-wait-profile: + @chmod +x scripts/wait_discord_chrome_profile.sh + @./scripts/wait_discord_chrome_profile.sh + +discord-login-stop: + $(COMPOSE) --profile discord-session stop discord-chromium + +extract-discord-tokens: discord-login-stop + $(MANAGE) extract_discord_tokens + +discord-tokens-reextract: extract-discord-tokens + +discord-tokens-refresh: discord-login discord-wait-profile extract-discord-tokens + # ── Utilities ───────────────────────────────────────────────────────────────── .PHONY: clean-mac diff --git a/SECURITY.md b/SECURITY.md index 0994a145..28e893b9 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -58,7 +58,7 @@ We consider reports for security weaknesses in **this repository** in the follow - **Django application** — web views, authentication and authorization, sessions, CSRF, admin, settings in [`config/settings.py`](config/settings.py), and deployment-related toggles documented in [`.env.example`](.env.example) (for example `USE_X_FORWARDED_HOST`, `USE_TLS_PROXY_HEADERS`, `CSRF_TRUSTED_ORIGINS`, `ALLOWED_HOSTS`). - **Management commands and scheduled work** — collectors and related commands, including behavior under Celery/Celery Beat when used as documented (for example [`docs/Workflow.md`](docs/Workflow.md), `config/boost_collector_schedule.yaml`). - **Credential and secret handling** — how tokens, keys, cookies, and workspace files are read, stored, logged, and passed to subprocesses or external APIs. -- **Integrations** — GitHub API usage; Slack and Discord connectors; Pinecone sync; YouTube API usage; **Chrome profile / session token** flows for Slack huddles (see [`.env.example`](.env.example)). +- **Integrations** — GitHub API usage; Slack and Discord connectors; Pinecone sync; YouTube API usage. - **Workspace and filesystem** — paths under `WORKSPACE_DIR` / `RAW_DIR` and related processing, when failure could lead to arbitrary file access, data leaks, or unsafe deserialization. ### Out of scope @@ -84,11 +84,10 @@ If you operate a deployment and suspect a leak or breach, **rotate** at least th | Category | Examples / environment variables | | --- | --- | | **GitHub** | `GITHUB_TOKEN`, `GITHUB_TOKENS_SCRAPING` (multi-token pool), `GITHUB_TOKEN_WRITE`; PAT-style tokens used by integrations (for example `SLACK_PR_BOT_GITHUB_TOKEN` if it is a PAT) | -| **Slack** | `SLACK_BOT_TOKEN_`, `SLACK_APP_TOKEN_`; if enabled: internal session tokens in `workspace/slack_event_handler/slack_internal_tokens.json` (see `ALLOW_INTERNAL_SLACK_TOKENS` in `.env.example`) | -| **Discord** | `DISCORD_TOKEN` (bot token — supported path); **`DISCORD_USER_TOKEN`** for automation **conflicts with Discord’s Terms of Service** and may result in **account termination** — **rotate and discontinue** use; migrate to bot-based flows where applicable (see [`.env.example`](.env.example) and project docs) | +| **Slack** | `SLACK_BOT_TOKEN_`, `SLACK_APP_TOKEN_` | +| **Discord** | `DISCORD_TOKEN` | | **Pinecone** | `PINECONE_API_KEY`, `PINECONE_PRIVATE_API_KEY`, and any host/index settings that grant write access | | **YouTube** | `YOUTUBE_API_KEY` | -| **Browser session material** | Data derived from **Chrome profiles or cookies** (`CHROME_PROFILE_PATH`, `slack_internal_tokens.json`, and related flows) — treat as secrets; clear or rotate sessions and profiles as appropriate | Also rotate **Django** `SECRET_KEY` and **database** credentials (`DATABASE_URL` or `DB_*`) if there is any chance the application or its configuration was exposed. diff --git a/config/settings.py b/config/settings.py index f9f625dc..c5252763 100644 --- a/config/settings.py +++ b/config/settings.py @@ -455,6 +455,19 @@ def _slack_team_scope_from_env(): # Discord configuration (for discord_activity_tracker) DISCORD_TOKEN = (env("DISCORD_TOKEN", default="") or "").strip() DISCORD_USER_TOKEN = (env("DISCORD_USER_TOKEN", default="") or "").strip() +ALLOW_INTERNAL_DISCORD_TOKENS = ( + env("ALLOW_INTERNAL_DISCORD_TOKENS", default="") or "" +).strip().lower() == "true" +DISCORD_INTERNAL_TOKENS_JSON = ( + env("DISCORD_INTERNAL_TOKENS_JSON", default="") or "" +).strip() +# Chrome user-data dir for Discord user token extraction (logged-in session on disk) +_DEFAULT_DISCORD_CHROME_PROFILE = str( + WORKSPACE_DIR / "discord_activity_tracker" / "chrome_profile" +) +DISCORD_CHROME_PROFILE_PATH = ( + env("DISCORD_CHROME_PROFILE_PATH", default=_DEFAULT_DISCORD_CHROME_PROFILE) or "" +).strip() _discord_server_id_str = (env("DISCORD_SERVER_ID", default="") or "").strip() DISCORD_SERVER_ID: int | None = ( int(_discord_server_id_str) if _discord_server_id_str.isdigit() else None diff --git a/config/test_settings.py b/config/test_settings.py index 91a4922c..d5280b16 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -108,3 +108,8 @@ # Tests patch a single subprocess.Popen for DiscordChatExporter. DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT = False + +# Tests set DISCORD_USER_TOKEN via monkeypatch; do not inherit internal-token mode +# from developer .env (get_or_load_discord_user_token would ignore env token). +ALLOW_INTERNAL_DISCORD_TOKENS = False +DISCORD_USER_TOKEN = "" diff --git a/core/operations/slack_ops/fetcher.py b/core/operations/slack_ops/fetcher.py index 496363cf..fbad0669 100644 --- a/core/operations/slack_ops/fetcher.py +++ b/core/operations/slack_ops/fetcher.py @@ -1,6 +1,6 @@ """ Slack Fetcher: file download, user/channel info, huddle transcript. -Uses SlackAPIClient for API calls; file download and xoxc/xoxd transcript here. +Uses SlackAPIClient for API calls; huddle transcript uses workspace session credentials. """ import os @@ -213,11 +213,10 @@ def download_file(file_url, save_path=None, filename=None, bot_token=None): def fetch_huddle_transcript(file_id): """ - Fetch huddle transcript/file info using xoxc/xoxd from workspace JSON. + Fetch huddle transcript/file info using session credentials from workspace JSON. - Stale JSON tokens with a valid Chrome profile are refreshed automatically via - get_or_load_slack_internal_token_pair (probe + re-extract). On auth errors, - re-extract is attempted once more before giving up. + Stale credentials are refreshed automatically. On auth errors, one refresh retry + is attempted before giving up. """ from slack_event_handler.utils.slack_internal_tokens_store import ( SLACK_TOKENS_RELOGIN_HINT, @@ -234,8 +233,8 @@ def fetch_huddle_transcript(file_id): if not pair: if team_id: logger.error( - "Cannot fetch huddle transcript for file %s: no valid Slack internal " - "tokens for team %s. %s", + "Cannot fetch huddle transcript for file %s: no valid session " + "credentials for team %s. %s", file_id, team_id, SLACK_TOKENS_RELOGIN_HINT, @@ -243,7 +242,7 @@ def fetch_huddle_transcript(file_id): else: logger.error( "Cannot fetch huddle transcript for file %s: no Slack team id " - "(set SLACK_TEAM_IDS) and no valid internal tokens. %s", + "(set SLACK_TEAM_IDS) and no valid session credentials. %s", file_id, SLACK_TOKENS_RELOGIN_HINT, ) @@ -270,7 +269,7 @@ def fetch_huddle_transcript(file_id): if team_id and is_slack_internal_token_auth_error(err) and not reextracted: reextracted = True logger.info( - "Slack auth error (%s); re-extracting tokens from Chrome profile", + "Slack auth error (%s); refreshing session credentials", err, ) new_pair = _extract_validate_and_return(team_id) @@ -280,8 +279,8 @@ def fetch_huddle_transcript(file_id): cookies = {"d": xoxd_token} continue logger.error( - "Cannot fetch huddle transcript for file %s: re-extract from Chrome " - "profile did not yield valid tokens for team %s. %s", + "Cannot fetch huddle transcript for file %s: credential refresh did not " + "yield valid session for team %s. %s", file_id, team_id, SLACK_TOKENS_RELOGIN_HINT, @@ -291,7 +290,7 @@ def fetch_huddle_transcript(file_id): log_slack_internal_tokens_still_invalid(team_id) logger.error( "Cannot fetch huddle transcript for file %s: Slack auth error (%s) " - "after re-extract. %s", + "after credential refresh. %s", file_id, err, SLACK_TOKENS_RELOGIN_HINT, diff --git a/core/tests/operations/test_slack_fetcher.py b/core/tests/operations/test_slack_fetcher.py index 090873f9..a8c055c9 100644 --- a/core/tests/operations/test_slack_fetcher.py +++ b/core/tests/operations/test_slack_fetcher.py @@ -474,7 +474,7 @@ def test_fetch_huddle_auth_error_when_reextract_fails( with caplog.at_level(logging.ERROR): assert fetch_huddle_transcript("Fx") is None _mock_reextract.assert_called_once_with("T1") - assert "slack-tokens-refresh" in caplog.text + assert ".env.example" in caplog.text @override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) diff --git a/discord_activity_tracker/README.md b/discord_activity_tracker/README.md index 4e6f1012..f54b60e8 100644 --- a/discord_activity_tracker/README.md +++ b/discord_activity_tracker/README.md @@ -10,7 +10,7 @@ Ingests **Discord server activity** (messages, threads, exports) into PostgreSQL ### Where we fetch data -**Discord** via **DiscordChatExporter** (bot/user token + server/channel configuration) within the `--since`/`--until` window, honoring resume semantics documented in the command help. +**Discord** via **DiscordChatExporter** (configured credentials + server/channel configuration) within the `--since`/`--until` window, honoring resume semantics documented in the command help. ### How data is saved to the database @@ -32,7 +32,7 @@ Unless `--skip-pinecone` (or deprecated `--ignore-pinecone`) is set, the run inv ## Main command: `run_discord_activity_tracker` -Orchestrates exporter fetch → DB upsert + raw JSON → Markdown export to `DISCORD_CONTEXT_REPO_PATH` → optional Pinecone via `run_cppa_pinecone_sync`. Requires `DISCORD_USER_TOKEN`, `DISCORD_SERVER_ID`; channel scope from `DISCORD_CHANNEL_IDS` unless `--channels` is set. +Orchestrates exporter fetch → DB upsert + raw JSON → Markdown export to `DISCORD_CONTEXT_REPO_PATH` → optional Pinecone via `run_cppa_pinecone_sync`. Requires configured Discord credentials (see `.env.example`), plus `DISCORD_SERVER_ID`; channel scope from `DISCORD_CHANNEL_IDS` unless `--channels` is set. | Option | Description | | --- | --- | @@ -41,7 +41,7 @@ Orchestrates exporter fetch → DB upsert + raw JSON → Markdown export to `DIS | `--skip-markdown-export` | Skip writing Markdown from the DB to `DISCORD_CONTEXT_REPO_PATH`. | | `--skip-remote-push` | Skip git commit/push after Markdown export (when auto-commit is enabled). | | `--skip-pinecone` / `--ignore-pinecone` | Skip Pinecone upsert for Discord messages (`--ignore-pinecone` is a deprecated alias). | -| `--since`, `--from-date`, `--start-time` | Exporter lower bound (`--after`): `YYYY-MM-DD` or ISO-8601 UTC. If omitted, resumes from latest DB message for the guild (or full history if empty). | +| `--since`, `--from-date`, `--start-time` | Exporter lower bound (`--after`): `YYYY-MM-DD` or ISO-8601 UTC. If omitted, resumes from latest DB message for the guild (or today UTC only if empty). | | `--until`, `--to-date`, `--end-time` | Exporter upper bound (`--before`); same formats. Omitted = through present. | | `--channels` | Comma-separated channel IDs (overrides `DISCORD_CHANNEL_IDS`). | | `--task` | **Deprecated.** `sync` \| `export` \| `all` — prefer `--skip-*` flags. | diff --git a/discord_activity_tracker/management/commands/extract_discord_tokens.py b/discord_activity_tracker/management/commands/extract_discord_tokens.py new file mode 100644 index 00000000..d5fa6929 --- /dev/null +++ b/discord_activity_tracker/management/commands/extract_discord_tokens.py @@ -0,0 +1,63 @@ +""" +Management command: extract_discord_tokens + +Persist Discord session credentials to workspace JSON. +""" + +import logging + +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError + +from discord_activity_tracker.utils.discord_internal_tokens_store import ( + discord_internal_tokens_json_path, + extract_and_save_discord_internal_tokens, +) +from discord_activity_tracker.utils.discord_tokens import ( + _resolve_discord_chrome_profile_root, +) +from discord_activity_tracker.workspace import get_chrome_profile_path + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = ( + "Persist Discord session credentials to " + "workspace/discord_activity_tracker/discord_internal_tokens.json." + ) + + def handle(self, *args, **options): + allow_raw = getattr(settings, "ALLOW_INTERNAL_DISCORD_TOKENS", "") or "" + if isinstance(allow_raw, bool): + allow = allow_raw + else: + allow = str(allow_raw).strip().lower() == "true" + if not allow: + self.stderr.write( + self.style.WARNING( + "Internal Discord session mode is not enabled: credentials will be saved to " + "workspace JSON but ignored by Django until enabled. " + "Restart web/celery after enabling. See .env.example." + ) + ) + + try: + profile = _resolve_discord_chrome_profile_root() + except ValueError as e: + raise CommandError(str(e)) from e + profile_path = str(profile) + if not profile.is_dir(): + raise CommandError( + "Session storage not found " + f"({profile_path}). Expected: {get_chrome_profile_path()}. " + "See .env.example." + ) + + token = extract_and_save_discord_internal_tokens() + if not token: + raise CommandError("Failed to load session credentials. See .env.example.") + out_path = discord_internal_tokens_json_path() + self.stdout.write( + self.style.SUCCESS(f"Saved Discord session credentials to {out_path}.") + ) diff --git a/discord_activity_tracker/management/commands/run_discord_activity_tracker.py b/discord_activity_tracker/management/commands/run_discord_activity_tracker.py index 432b009c..21d33025 100644 --- a/discord_activity_tracker/management/commands/run_discord_activity_tracker.py +++ b/discord_activity_tracker/management/commands/run_discord_activity_tracker.py @@ -18,14 +18,15 @@ 4. **Pinecone** — ``task_discord_pinecone_sync`` when ``PINECONE_DISCORD_*`` are set and ``--skip-pinecone`` is not used. -Required settings for a full sync: ``DISCORD_USER_TOKEN``, ``DISCORD_SERVER_ID``. +Required settings for a full sync: configured Discord credentials (see ``.env.example``), +``DISCORD_SERVER_ID``. Channel scope uses ``DISCORD_CHANNEL_IDS`` unless overridden by ``--channels``. CLI flags are documented on ``Command.add_argument`` ``help=`` strings and in ``docs/service_api/discord_activity_tracker.md``. Raises: - django.core.management.base.CommandError: Missing token/guild, invalid + django.core.management.base.CommandError: Missing credentials/guild, invalid ``--since``/``--until`` parse, or DiscordChatExporter failure (wrapped from ``DiscordChatExporterError``). Other exceptions from the collector may propagate after logging from ``_handle_core``. @@ -65,13 +66,14 @@ latest_message_created_at_for_guild, ) from discord_activity_tracker.sync.chat_exporter import ( + ChannelDayExport, DiscordChatExporterError, _safe_int, convert_exporter_message_to_dict, export_guild_to_json, - filter_discord_export_json_paths, parse_exported_json, ) +from discord_activity_tracker.sync.raw_archive import merge_exporter_json from discord_activity_tracker.sync.messages import _process_messages_in_batches from discord_activity_tracker.workspace import ( clear_exporter_staging_dir, @@ -100,15 +102,19 @@ def _resolve_exporter_date_bounds( *, guild_snowflake: int, channel_ids: list[int], -) -> tuple[datetime | None, datetime | None]: - """Compute ``after_date`` / ``before_date`` in UTC for DiscordChatExporter. +) -> tuple[datetime | None, datetime | None, bool]: + """Compute exporter date bounds and whether incremental mode is per-channel. - - With ``--since``: lower bound is that timestamp. - - Without ``--since``: lower bound is the latest stored ``message_created_at`` for this - guild (scoped to the channel allowlist when set), or ``None`` if the DB has no rows - (full-history export / no ``--after`` filter). + - With ``--since``: lower bound is that timestamp for every channel. + - Without ``--since``: each channel resumes from the UTC day start of its own latest + stored message (overlap re-export; duplicates merged by message id). Channels with + no rows export today (UTC) only. - With ``--until``: upper bound is that timestamp. - Without ``--until``: upper bound is ``None`` (export through the present; no ``--before``). + + Returns ``(after_date, before_date, per_channel_incremental)``. When + ``per_channel_incremental`` is true, ``after_date`` is only used for logging / + checkpoint display (guild-wide latest), not passed to DiscordChatExporter. """ since_s = (options.get("since") or "").strip() or None until_s = (options.get("until") or "").strip() or None @@ -145,7 +151,7 @@ def _resolve_exporter_date_bounds( ) else: logger.debug( - "exporter lower bound: none (--since omitted, empty DB for guild scope)", + "exporter lower bound: today UTC only (--since omitted, empty DB for guild scope)", ) if until is not None: @@ -153,13 +159,14 @@ def _resolve_exporter_date_bounds( else: before_date = None - return after_date, before_date + per_channel_incremental = since is None + return after_date, before_date, per_channel_incremental def task_preprocess_workspace(*, dry_run: bool) -> None: """Ensure ``WORKSPACE_DIR/raw/discord_activity_tracker`` and staging dirs exist.""" - # get_exporter_staging_dir() calls get_raw_dir(); both trees are mkdir'd here. get_exporter_staging_dir() + get_raw_dir() if dry_run: logger.info( "dry-run would ensure raw workspace under %s", @@ -176,6 +183,7 @@ def task_discord_sync( channel_ids: list[int], after_date: datetime | None, before_date: datetime | None, + per_channel_incremental: bool, collector: "DiscordActivityCollector", ) -> int: """DiscordChatExporter → parse → db_sync → archive JSON per channel.""" @@ -194,7 +202,12 @@ def task_discord_sync( clear_exporter_staging_dir() collector.stdout.write("=== Discord sync (fetch → db_sync → save_raw) ===") - if after_date: + if per_channel_incremental: + collector.stdout.write( + "Incremental: per-channel lower bound (UTC day start of latest stored " + "message per channel; duplicates merged by message id)" + ) + elif after_date: collector.stdout.write( f"Incremental: fetching messages after {after_date.isoformat()} UTC" ) @@ -206,23 +219,24 @@ def task_discord_sync( ) try: - json_files = export_guild_to_json( + exports: list[ChannelDayExport] = export_guild_to_json( user_token=user_token, guild_id=guild_id, output_dir=staging, - after_date=after_date, + after_date=after_date if not per_channel_incremental else None, before_date=before_date, channel_ids=channel_ids or None, + per_channel_incremental=per_channel_incremental, ) except DiscordChatExporterError as exc: raise CommandError(f"DiscordChatExporter failed: {exc}") from exc - json_files = filter_discord_export_json_paths(json_files) - - collector.stdout.write(f"Exported {len(json_files)} channel file(s)") + collector.stdout.write(f"Exported {len(exports)} channel-day file(s)") processed_total = 0 - for i, json_path in enumerate(json_files, 1): + for i, export in enumerate(exports, 1): + json_path = export.path + day_str = export.day_str try: data = parse_exported_json(json_path) envelope = validate_envelope(data, source=json_path.name) @@ -240,7 +254,8 @@ def task_discord_sync( continue collector.stdout.write( - f" [{i}/{len(json_files)}] #{ch_name}: {len(messages)} messages" + f" [{i}/{len(exports)}] #{ch_name} / {day_str}: " + f"{len(messages)} message(s) fetched" ) count = asyncio.run( collector._persist_channel(guild_info, channel_info, messages) @@ -248,9 +263,12 @@ def task_discord_sync( processed_total += count channel_raw_dir = get_channel_raw_dir(srv_id, ch_id) - date_tag = after_date.strftime("%Y-%m-%d") if after_date else "full" - dest = channel_raw_dir / f"{date_tag}.json" - json_path.rename(dest) + dest = channel_raw_dir / f"{day_str}.json" + merged_count = merge_exporter_json(dest, data, day=day_str) + collector.stdout.write( + f" archived {merged_count} message(s) -> {dest.name}" + ) + json_path.unlink(missing_ok=True) except StagingValidationError as exc: logger.error( @@ -375,7 +393,7 @@ def load_incremental_state(self) -> IncrementalState | None: guild_id: int | None = getattr(settings, "DISCORD_SERVER_ID", None) if not guild_id: return None - after_date, _before = _resolve_exporter_date_bounds( + after_date, _before, _per_ch = _resolve_exporter_date_bounds( self.options, guild_snowflake=guild_id, channel_ids=self.channel_ids, @@ -452,7 +470,7 @@ class Command(BaseCollectorCommand): restrict channels and skip Pinecone. Raises: - CommandError: If ``DISCORD_USER_TOKEN`` or ``DISCORD_SERVER_ID`` is unset, or + CommandError: If Discord credentials or ``DISCORD_SERVER_ID`` is unset, or date options fail to parse, or DiscordChatExporter fails (see ``task_discord_sync``). See Also: @@ -509,7 +527,7 @@ def add_arguments(self, parser): dest="since", help="Exporter lower bound (--after): YYYY-MM-DD or ISO-8601 (UTC). " "If omitted, uses the latest message time already in the DB for this guild " - "(and channel allowlist), or full history when the DB has no rows.", + "(and channel allowlist), or today (UTC) only when the DB has no rows.", ) parser.add_argument( "--until", @@ -569,19 +587,25 @@ def _handle_core( } ) - user_token = (getattr(settings, "DISCORD_USER_TOKEN", "") or "").strip() + from discord_activity_tracker.utils.discord_internal_tokens_store import ( + get_or_load_discord_user_token, + ) + + user_token = get_or_load_discord_user_token() guild_id: int | None = getattr(settings, "DISCORD_SERVER_ID", None) if not user_token: - raise CommandError("DISCORD_USER_TOKEN not configured.") + raise CommandError("Discord credentials not configured. See .env.example.") if not guild_id: raise CommandError("DISCORD_SERVER_ID not configured.") try: - after_date, before_date = _resolve_exporter_date_bounds( - options, - guild_snowflake=guild_id, - channel_ids=collector.channel_ids, + after_date, before_date, per_channel_incremental = ( + _resolve_exporter_date_bounds( + options, + guild_snowflake=guild_id, + channel_ids=collector.channel_ids, + ) ) except CommandError: raise @@ -617,13 +641,18 @@ def _handle_core( collector.stdout.write( f" Channel allowlist: {collector.channel_ids or 'all channels'}" ) - if after_date: + if per_channel_incremental: + collector.stdout.write( + " Lower bound (--after): per-channel (UTC day of latest " + "stored message; empty channel = today UTC only)" + ) + elif after_date: collector.stdout.write( f" Lower bound (--after): {after_date.isoformat()} UTC" ) else: collector.stdout.write( - " Lower bound (--after): none (full history; empty DB or no since)" + " Lower bound (--after): today (UTC) only (empty DB, no --since)" ) if before_date: collector.stdout.write( @@ -646,6 +675,7 @@ def _handle_core( channel_ids=collector.channel_ids, after_date=after_date, before_date=before_date, + per_channel_incremental=per_channel_incremental, collector=collector, ) diff --git a/discord_activity_tracker/sync/chat_exporter.py b/discord_activity_tracker/sync/chat_exporter.py index 2551123f..bfa83409 100644 --- a/discord_activity_tracker/sync/chat_exporter.py +++ b/discord_activity_tracker/sync/chat_exporter.py @@ -1,4 +1,4 @@ -"""DiscordChatExporter CLI wrapper for user token-based scraping.""" +"""DiscordChatExporter CLI wrapper for configured exporter credentials.""" from __future__ import annotations @@ -10,6 +10,7 @@ import shutil import subprocess import sys +from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Sequence @@ -21,6 +22,7 @@ from discord_activity_tracker.protocol_impl import DiscordActivityRecord +from .exporter_window import iter_channel_export_days, resolve_channel_export_after from .utils import format_discord_url from ..workspace import get_workspace_root @@ -63,6 +65,15 @@ class DiscordChatExporterError(Exception): pass +@dataclass(frozen=True) +class ChannelDayExport: + """One DiscordChatExporter JSON file for a channel and UTC calendar day.""" + + path: Path + day_str: str + channel_id: int + + def _default_cli_basename() -> str: """DiscordChatExporter ships ``.exe`` on Windows and extensionless ``DiscordChatExporter.Cli`` on macOS/Linux.""" if sys.platform == "win32": @@ -444,19 +455,23 @@ def _append_export_window( ) -def _export_guild_sequential( +def _is_empty_channel_export_error(message: str) -> bool: + """True when DiscordChatExporter reports no messages for the requested window.""" + lower = message.lower() + return ( + "no messages" in lower + or "channel is empty" in lower + or "does not contain" in lower + ) + + +def _resolve_export_channel_ids( cli_path: Path, user_token: str, guild_id: int, - output_dir: Path, - after_date: Optional[datetime], - before_date: Optional[datetime], include_threads: str, channel_ids: Optional[Sequence[int]], -) -> None: - logger.info( - "DiscordChatExporter sequential mode (DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT)" - ) +) -> List[int]: if channel_ids: seen: set[int] = set() ids: List[int] = [] @@ -469,73 +484,127 @@ def _export_guild_sequential( "export runs directly (avoids OOM/SIGKILL on huge guilds)", len(ids), ) - else: - raw_ids = _run_channels_listing( + return ids + return list( + _run_channels_listing( cli_path, user_token, guild_id, include_threads=include_threads ) - ids = list(raw_ids) - if not ids: - raise DiscordChatExporterError( - "No channels to export after listing the guild (check DISCORD_CHANNEL_IDS / " - "--channels filter, token access, or INCLUDE_VC if you need voice channels)." - ) - logger.info("Exporting %d channel(s) one process at a time", len(ids)) - for ch_id in ids: - # `export` (per-channel) does not support --include-threads or --respect-rate-limits - # in DiscordChatExporter 2.40+; thread inclusion applies to `channels` / `exportguild` only. - cmd = _cli_argv_head(cli_path) + [ - "export", - "--token", - user_token, - "--channel", - str(ch_id), - "--output", - str(output_dir) + os.sep, - "--format", - "Json", - "--parallel", - "1", - "--markdown", - "True", - ] - _append_export_window(cmd, after_date, before_date) - logger.info("Running DiscordChatExporter export for channel %s", ch_id) - _run_exporter_streaming(cmd, cli_path=cli_path) - logger.info("Sequential export completed successfully") - - -def _export_guild_exportguild( + ) + + +def export_channel_window_to_json( cli_path: Path, user_token: str, - guild_id: int, - output_dir: Path, + channel_id: int, + output_path: Path, after_date: Optional[datetime], before_date: Optional[datetime], - include_threads: str, -) -> None: +) -> Path: + """Run DiscordChatExporter ``export`` for one channel and time window.""" + output_path.parent.mkdir(parents=True, exist_ok=True) cmd = _cli_argv_head(cli_path) + [ - "exportguild", + "export", "--token", user_token, - "--guild", - str(guild_id), + "--channel", + str(channel_id), "--output", - str(output_dir) + os.sep, + str(output_path), "--format", "Json", - "--include-threads", - include_threads, - "--include-vc", - _cli_bool(_get_include_voice_channels()), "--parallel", - str(_get_parallel_workers()), + "1", "--markdown", "True", ] _append_export_window(cmd, after_date, before_date) - logger.info("Running DiscordChatExporter exportguild for guild %s", guild_id) + logger.info( + "Running DiscordChatExporter export for channel %s -> %s", + channel_id, + output_path.name, + ) _run_exporter_streaming(cmd, cli_path=cli_path) - logger.info("Exportguild completed successfully") + return output_path + + +def _export_guild_by_channel_day( + cli_path: Path, + user_token: str, + guild_id: int, + output_dir: Path, + after_date: Optional[datetime], + before_date: Optional[datetime], + include_threads: str, + channel_ids: Optional[Sequence[int]], + *, + per_channel_incremental: bool = False, +) -> List[ChannelDayExport]: + """Export each channel for each UTC day in the resolved window.""" + ids = _resolve_export_channel_ids( + cli_path, user_token, guild_id, include_threads, channel_ids + ) + if not ids: + raise DiscordChatExporterError( + "No channels to export after listing the guild (check DISCORD_CHANNEL_IDS / " + "--channels filter, token access, or INCLUDE_VC if you need voice channels)." + ) + + explicit_after = after_date + results: List[ChannelDayExport] = [] + for ch_id in ids: + ch_after = resolve_channel_export_after( + guild_id, + ch_id, + explicit_after=explicit_after, + ) + days = iter_channel_export_days(after=ch_after, before=before_date) + if not days: + logger.debug("No UTC day windows for channel %s", ch_id) + continue + logger.info( + "Exporting channel %s x %d UTC day(s) (after=%s)", + ch_id, + len(days), + ch_after.isoformat() if ch_after else "none", + ) + for day_str, window_after, window_before in days: + output_path = output_dir / f"{ch_id}_{day_str}.json" + try: + export_channel_window_to_json( + cli_path, + user_token, + ch_id, + output_path, + window_after, + window_before, + ) + except DiscordChatExporterError as exc: + if _is_empty_channel_export_error(str(exc)): + logger.info( + "No messages for channel %s on %s UTC, skipping", + ch_id, + day_str, + ) + output_path.unlink(missing_ok=True) + continue + raise + if output_path.is_file(): + results.append( + ChannelDayExport( + path=output_path, + day_str=day_str, + channel_id=ch_id, + ) + ) + else: + logger.debug( + "DiscordChatExporter produced no file for channel %s on %s", + ch_id, + day_str, + ) + + logger.info("Per-channel per-day export completed (%d file(s))", len(results)) + return results def filter_discord_export_json_paths(paths: Iterable[Path]) -> List[Path]: @@ -556,8 +625,10 @@ def export_guild_to_json( before_date: Optional[datetime] = None, include_threads: str = "None", channel_ids: Optional[Sequence[int]] = None, -) -> List[Path]: - """Export all channels from a guild. Returns list of JSON file paths.""" + *, + per_channel_incremental: bool = False, +) -> List[ChannelDayExport]: + """Export guild channels one UTC day at a time. Returns per-day export descriptors.""" from django.conf import settings cli_path = _get_cli_path() @@ -587,28 +658,55 @@ def export_guild_to_json( output_dir.mkdir(parents=True, exist_ok=True) + export_results: List[ChannelDayExport] = [] + + def _run_export(active_token: str) -> None: + nonlocal export_results + export_results = _export_guild_by_channel_day( + cli_path, + active_token, + guild_id, + output_dir, + after_date, + before_date, + include_threads, + channel_ids, + per_channel_incremental=per_channel_incremental, + ) + try: - if _get_sequential_export(): - _export_guild_sequential( - cli_path, - user_token, - guild_id, - output_dir, - after_date, - before_date, - include_threads, - channel_ids, + try: + _run_export(user_token) + except DiscordChatExporterError as exc: + from discord_activity_tracker.utils.discord_internal_tokens_store import ( + DISCORD_TOKENS_RELOGIN_HINT, + extract_and_save_discord_internal_tokens, ) - else: - _export_guild_exportguild( - cli_path, - user_token, - guild_id, - output_dir, - after_date, - before_date, - include_threads, + from discord_activity_tracker.utils.discord_tokens import ( + is_discord_exporter_auth_error, ) + + allow_internal = getattr(settings, "ALLOW_INTERNAL_DISCORD_TOKENS", False) + if isinstance(allow_internal, str): + allow_internal = allow_internal.strip().lower() == "true" + if allow_internal and is_discord_exporter_auth_error(str(exc)): + logger.info( + "DiscordChatExporter auth failure; refreshing session credentials" + ) + refreshed = extract_and_save_discord_internal_tokens() + if refreshed and refreshed != user_token: + logger.info( + "Retrying DiscordChatExporter with refreshed credentials" + ) + _run_export(refreshed) + else: + logger.error( + "Discord export auth failed and credential refresh did not help. %s", + DISCORD_TOKENS_RELOGIN_HINT, + ) + raise + else: + raise except DiscordChatExporterError: raise except OSError as e: @@ -625,9 +723,8 @@ def export_guild_to_json( logger.exception("Unexpected error running DiscordChatExporter: %s", e) raise DiscordChatExporterError(f"Unexpected error: {e}") from e - json_files = _sorted_discord_export_json_paths(output_dir) - logger.info("Found %d exported JSON files", len(json_files)) - return json_files + logger.info("Found %d exported JSON files", len(export_results)) + return export_results def parse_exported_json(json_path: Path) -> Dict[str, Any]: @@ -751,18 +848,17 @@ def export_and_parse_guild( after_date: Optional[datetime] = None, ) -> List[Dict[str, Any]]: """Export guild via CLI and parse all resulting JSON files.""" - json_files = export_guild_to_json( + exports = export_guild_to_json( user_token=user_token, guild_id=guild_id, output_dir=output_dir, after_date=after_date, ) - json_files = filter_discord_export_json_paths(json_files) - parsed_channels = [] - for json_path in json_files: + for export in exports: + json_path = export.path try: data = parse_exported_json(json_path) diff --git a/discord_activity_tracker/sync/exporter_window.py b/discord_activity_tracker/sync/exporter_window.py index e27d8d87..0b00085f 100644 --- a/discord_activity_tracker/sync/exporter_window.py +++ b/discord_activity_tracker/sync/exporter_window.py @@ -2,7 +2,7 @@ from __future__ import annotations -from datetime import datetime +from datetime import datetime, timedelta, timezone from django.db.models import Max @@ -22,3 +22,135 @@ def latest_message_created_at_for_guild( if channel_ids: qs = qs.filter(channel__channel_id__in=channel_ids) return qs.aggregate(m=Max("message_created_at"))["m"] + + +def latest_message_created_at_for_channel( + guild_snowflake: int, + channel_snowflake: int, +) -> datetime | None: + """Latest ``message_created_at`` for one channel (non-deleted messages).""" + return DiscordMessage.objects.filter( + channel__server__server_id=guild_snowflake, + channel__channel_id=channel_snowflake, + is_deleted=False, + ).aggregate(m=Max("message_created_at"))["m"] + + +def incremental_export_after(latest: datetime) -> datetime: + """Lower bound for the next scheduled export with overlap. + + Returns UTC midnight on the calendar day of *latest* so the full day is + re-exported. Duplicate messages are merged by snowflake id; gaps are not. + """ + return utc_day_start(latest) + + +def resolve_channel_export_after( + guild_snowflake: int, + channel_snowflake: int, + *, + explicit_after: datetime | None, +) -> datetime | None: + """Per-channel ``--after`` for DiscordChatExporter. + + When *explicit_after* is set (``--since``), it applies to every channel. + Otherwise resumes from the UTC day start of that channel's latest stored + message, or ``None`` when the channel has no rows (today-only export). + """ + if explicit_after is not None: + return explicit_after + latest = latest_message_created_at_for_channel( + guild_snowflake, + channel_snowflake, + ) + if latest is None: + return None + return incremental_export_after(latest) + + +def utc_day_start(dt: datetime) -> datetime: + """UTC midnight for the calendar day containing *dt*.""" + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + else: + dt = dt.astimezone(timezone.utc) + return dt.replace(hour=0, minute=0, second=0, microsecond=0) + + +def iter_channel_export_days( + *, + after: datetime | None, + before: datetime | None, + now: datetime | None = None, +) -> list[tuple[str, datetime, datetime]]: + """Build per-day UTC export windows for DiscordChatExporter. + + Returns ``(YYYY-MM-DD, window_after, window_before)`` in chronological order. + + - When *after* is ``None`` (empty DB, no ``--since``): **today only** (UTC). + - Otherwise: from ``floor(after)`` through ``floor(before or now)`` inclusive. + - Each window is clipped to ``[max(day_start, after), min(day_end, before or now)]``. + - Skips days where the clipped window is empty (``after >= before``). + """ + if now is None: + now = datetime.now(timezone.utc) + elif now.tzinfo is None: + now = now.replace(tzinfo=timezone.utc) + else: + now = now.astimezone(timezone.utc) + + upper = now + if before is not None: + upper = ( + before.astimezone(timezone.utc) + if before.tzinfo is not None + else before.replace(tzinfo=timezone.utc) + ) + + if after is None: + first_day = utc_day_start(now) + last_day = first_day + else: + after_utc = ( + after.astimezone(timezone.utc) + if after.tzinfo is not None + else after.replace(tzinfo=timezone.utc) + ) + first_day = utc_day_start(after_utc) + last_day = utc_day_start(upper) + + after_utc: datetime | None = None + if after is not None: + after_utc = ( + after.astimezone(timezone.utc) + if after.tzinfo is not None + else after.replace(tzinfo=timezone.utc) + ) + + before_utc: datetime | None = None + if before is not None: + before_utc = ( + before.astimezone(timezone.utc) + if before.tzinfo is not None + else before.replace(tzinfo=timezone.utc) + ) + + result: list[tuple[str, datetime, datetime]] = [] + day = first_day + while day <= last_day: + day_end = day + timedelta(days=1) + window_after = day + window_before = day_end + + if after_utc is not None and day == first_day: + window_after = max(day, after_utc) + if before_utc is not None and day == last_day: + window_before = min(day_end, before_utc) + elif before is None and day == last_day: + window_before = min(day_end, now) + + if window_after < window_before: + result.append((day.strftime("%Y-%m-%d"), window_after, window_before)) + day += timedelta(days=1) + + return result diff --git a/discord_activity_tracker/sync/raw_archive.py b/discord_activity_tracker/sync/raw_archive.py new file mode 100644 index 00000000..13fa58a5 --- /dev/null +++ b/discord_activity_tracker/sync/raw_archive.py @@ -0,0 +1,136 @@ +"""Merge DiscordChatExporter JSON into per-day raw archives.""" + +from __future__ import annotations + +import json +import logging +import os +import tempfile +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from core.utils.datetime_parsing import parse_iso_datetime_lenient + +logger = logging.getLogger(__name__) + + +def message_utc_date_str(msg: dict[str, Any]) -> str | None: + """Return ``YYYY-MM-DD`` (UTC) for an exporter message dict, or ``None`` if unparseable.""" + raw_ts = msg.get("timestamp") + if not raw_ts: + return None + dt = parse_iso_datetime_lenient(str(raw_ts)) + if dt is None: + return None + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + else: + dt = dt.astimezone(timezone.utc) + return dt.strftime("%Y-%m-%d") + + +def _message_sort_key(msg: dict[str, Any]) -> tuple[str, str]: + ts = str(msg.get("timestamp") or "") + mid = str(msg.get("id") or "") + return (ts, mid) + + +def _filter_messages_for_day( + messages: list[dict[str, Any]], day: str +) -> list[dict[str, Any]]: + return [m for m in messages if message_utc_date_str(m) == day] + + +def _merge_message_lists( + existing: list[dict[str, Any]], + incoming: list[dict[str, Any]], +) -> list[dict[str, Any]]: + by_id: dict[str, dict[str, Any]] = {} + for msg in existing: + mid = str(msg.get("id", "")) + if mid: + by_id[mid] = msg + for msg in incoming: + mid = str(msg.get("id", "")) + if mid: + by_id[mid] = msg + return sorted(by_id.values(), key=_message_sort_key) + + +def _refresh_envelope_metadata(merged: dict[str, Any]) -> None: + messages: list[dict[str, Any]] = merged.get("messages") or [] + now_iso = datetime.now(timezone.utc).isoformat() + merged["exportedAt"] = now_iso + + if not messages: + date_range = merged.setdefault("dateRange", {}) + if not isinstance(date_range, dict): + merged["dateRange"] = date_range = {} + return + + timestamps = [] + for msg in messages: + dt = parse_iso_datetime_lenient(str(msg.get("timestamp") or "")) + if dt is not None: + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + else: + dt = dt.astimezone(timezone.utc) + timestamps.append(dt) + + if timestamps: + earliest = min(timestamps) + latest = max(timestamps) + merged["dateRange"] = { + "after": earliest.isoformat(), + "before": latest.isoformat(), + } + + +def merge_exporter_json(dest: Path, incoming: dict[str, Any], *, day: str) -> int: + """Merge *incoming* exporter JSON into *dest* for UTC calendar day *day*. + + Messages are keyed by snowflake ``id``; incoming overwrites existing entries. + Only messages on *day* (UTC) are kept in the archive. + + Returns the number of messages written to the merged file. + """ + incoming_msgs = _filter_messages_for_day(incoming.get("messages") or [], day) + + if dest.is_file(): + with open(dest, "r", encoding="utf-8") as f: + existing = json.load(f) + existing_msgs = _filter_messages_for_day(existing.get("messages") or [], day) + merged_msgs = _merge_message_lists(existing_msgs, incoming_msgs) + merged = dict(existing) + merged["guild"] = incoming.get("guild") or existing.get("guild") or {} + merged["channel"] = incoming.get("channel") or existing.get("channel") or {} + else: + merged_msgs = _merge_message_lists([], incoming_msgs) + merged = { + "guild": incoming.get("guild") or {}, + "channel": incoming.get("channel") or {}, + } + + merged["messages"] = merged_msgs + _refresh_envelope_metadata(merged) + + dest.parent.mkdir(parents=True, exist_ok=True) + fd, tmp_path = tempfile.mkstemp( + suffix=".json", dir=dest.parent, prefix=f".{dest.stem}." + ) + try: + with os.fdopen(fd, "w", encoding="utf-8") as f: + json.dump(merged, f, ensure_ascii=False, indent=2) + f.write("\n") + os.replace(tmp_path, dest) + except Exception: + try: + os.unlink(tmp_path) + except OSError: + pass + raise + + logger.debug("Merged %d message(s) into %s", len(merged_msgs), dest) + return len(merged_msgs) diff --git a/discord_activity_tracker/tests/test_chat_exporter_branch_coverage.py b/discord_activity_tracker/tests/test_chat_exporter_branch_coverage.py index ee90133d..50144731 100644 --- a/discord_activity_tracker/tests/test_chat_exporter_branch_coverage.py +++ b/discord_activity_tracker/tests/test_chat_exporter_branch_coverage.py @@ -126,11 +126,7 @@ def test_export_guild_os_error_errno_8_wraps(tmp_path, monkeypatch): "discord_activity_tracker.sync.chat_exporter.validate_discord_chat_exporter_cli_architecture", ), patch( - "discord_activity_tracker.sync.chat_exporter._get_sequential_export", - return_value=False, - ), - patch( - "discord_activity_tracker.sync.chat_exporter._export_guild_exportguild", + "discord_activity_tracker.sync.chat_exporter._export_guild_by_channel_day", side_effect=err, ), ): diff --git a/discord_activity_tracker/tests/test_discord_internal_tokens_store.py b/discord_activity_tracker/tests/test_discord_internal_tokens_store.py new file mode 100644 index 00000000..d91a2578 --- /dev/null +++ b/discord_activity_tracker/tests/test_discord_internal_tokens_store.py @@ -0,0 +1,126 @@ +"""Tests for workspace JSON Discord internal token storage.""" + +import json +import logging +from unittest.mock import patch + +import pytest +from django.test import override_settings + +from discord_activity_tracker.utils import discord_internal_tokens_store as store + + +@override_settings( + WORKSPACE_DIR="/tmp/ws", + DISCORD_INTERNAL_TOKENS_JSON="", +) +def test_save_and_load_tokens(tmp_path, settings): + settings.WORKSPACE_DIR = str(tmp_path) + path = store.save_discord_internal_tokens( + "discord-tok", user_id="123", username="alice" + ) + assert ( + path == tmp_path / "discord_activity_tracker" / "discord_internal_tokens.json" + ) + data = json.loads(path.read_text(encoding="utf-8")) + assert data["user_token"] == "discord-tok" + assert data["user_id"] == "123" + loaded = store.load_discord_internal_tokens() + assert loaded["user_token"] == "discord-tok" + assert loaded["username"] == "alice" + + +@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") +def test_get_discord_user_token_from_json(tmp_path, settings): + settings.WORKSPACE_DIR = str(tmp_path) + store.save_discord_internal_tokens("tok") + assert store.get_discord_user_token_from_json() == "tok" + + +@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=False, WORKSPACE_DIR="/tmp/ws") +def test_get_token_from_json_disabled(tmp_path, settings): + settings.WORKSPACE_DIR = str(tmp_path) + store.save_discord_internal_tokens("tok") + assert store.get_discord_user_token_from_json() is None + + +def test_save_requires_token(): + with pytest.raises(ValueError): + store.save_discord_internal_tokens("") + + +@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=False, DISCORD_USER_TOKEN="env-tok") +def test_get_or_load_uses_env_when_internal_disabled(): + assert store.get_or_load_discord_user_token() == "env-tok" + + +@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") +@patch( + "discord_activity_tracker.utils.discord_tokens.probe_discord_user_token", + return_value=True, +) +@patch( + "discord_activity_tracker.utils.discord_internal_tokens_store.extract_and_save_discord_internal_tokens", + return_value="fresh-tok", +) +def test_get_or_load_extracts_when_json_missing( + mock_extract, _mock_probe, tmp_path, settings +): + settings.WORKSPACE_DIR = str(tmp_path) + token = store.get_or_load_discord_user_token() + assert token == "fresh-tok" + mock_extract.assert_called_once() + + +@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") +@patch( + "discord_activity_tracker.utils.discord_tokens.probe_discord_user_token", + side_effect=[False, True], +) +@patch( + "discord_activity_tracker.utils.discord_internal_tokens_store.extract_and_save_discord_internal_tokens", + return_value="new-tok", +) +def test_get_or_load_reextracts_when_json_tokens_stale( + mock_extract, _mock_probe, tmp_path, settings +): + settings.WORKSPACE_DIR = str(tmp_path) + store.save_discord_internal_tokens("old-tok") + token = store.get_or_load_discord_user_token() + assert token == "new-tok" + mock_extract.assert_called_once() + + +@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") +@patch( + "discord_activity_tracker.utils.discord_tokens.probe_discord_user_token", + return_value=False, +) +@patch( + "discord_activity_tracker.utils.discord_internal_tokens_store.extract_and_save_discord_internal_tokens", + return_value="bad-tok", +) +def test_get_or_load_logs_when_reextracted_tokens_still_invalid( + mock_extract, _mock_probe, tmp_path, settings, caplog +): + settings.WORKSPACE_DIR = str(tmp_path) + store.save_discord_internal_tokens("old-tok") + with caplog.at_level(logging.ERROR): + token = store.get_or_load_discord_user_token() + assert token is None + mock_extract.assert_called_once() + assert "still invalid" in caplog.text + assert ".env.example" in caplog.text + + +@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") +@patch( + "discord_activity_tracker.utils.discord_tokens.probe_discord_user_token", + return_value=True, +) +def test_get_or_load_keeps_valid_json_tokens(_mock_probe, tmp_path, settings): + settings.WORKSPACE_DIR = str(tmp_path) + store.save_discord_internal_tokens("tok") + token = store.get_or_load_discord_user_token() + assert token == "tok" + _mock_probe.assert_called_once_with("tok") diff --git a/discord_activity_tracker/tests/test_discord_tokens.py b/discord_activity_tracker/tests/test_discord_tokens.py new file mode 100644 index 00000000..c32418ff --- /dev/null +++ b/discord_activity_tracker/tests/test_discord_tokens.py @@ -0,0 +1,150 @@ +"""Tests for discord_activity_tracker.utils.discord_tokens (no real Chrome profile).""" + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from django.test import override_settings + +from discord_activity_tracker.utils import discord_tokens as dt + + +def test_parse_discord_token_raw_strips_prefix_and_quotes(): + raw = b'\x01"my-discord-token"' + assert dt._parse_discord_token_raw(raw) == "my-discord-token" + + +def test_parse_discord_token_raw_plain(): + assert dt._parse_discord_token_raw(b"plain-token") == "plain-token" + + +def test_parse_discord_token_raw_empty_raises(): + with pytest.raises(ValueError): + dt._parse_discord_token_raw(b"") + + +@patch("discord_activity_tracker.utils.discord_tokens.requests.get") +def test_probe_discord_user_token_ok(mock_get): + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_get.return_value = mock_resp + assert dt.probe_discord_user_token("tok") is True + + +@patch("discord_activity_tracker.utils.discord_tokens.requests.get") +def test_probe_discord_user_token_auth_error(mock_get): + mock_resp = MagicMock() + mock_resp.status_code = 401 + mock_get.return_value = mock_resp + assert dt.probe_discord_user_token("tok") is False + + +def test_probe_discord_user_token_empty(): + assert dt.probe_discord_user_token("") is False + + +@patch("discord_activity_tracker.utils.discord_tokens.requests.get") +def test_probe_discord_user_token_details(mock_get): + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = {"id": "123", "username": "alice"} + mock_get.return_value = mock_resp + out = dt.probe_discord_user_token_details("tok") + assert out == {"user_id": "123", "username": "alice"} + + +def test_is_discord_exporter_auth_error(): + assert dt.is_discord_exporter_auth_error("HTTP 401 Unauthorized") + assert dt.is_discord_exporter_auth_error("invalid token") + assert not dt.is_discord_exporter_auth_error("channel not found") + + +@override_settings(DISCORD_CHROME_PROFILE_PATH="", WORKSPACE_DIR="/tmp/ws") +def test_resolve_discord_chrome_profile_uses_workspace_default(tmp_path, settings): + settings.WORKSPACE_DIR = str(tmp_path) + expected = tmp_path / "discord_activity_tracker" / "chrome_profile" + expected.mkdir(parents=True) + assert dt._resolve_discord_chrome_profile_root() == expected.resolve() + + +def test_resolve_discord_chrome_profile_respects_custom_path(tmp_path): + custom = tmp_path / "custom_discord_chrome" + custom.mkdir() + with override_settings( + DISCORD_CHROME_PROFILE_PATH=str(custom), WORKSPACE_DIR="/tmp/ws" + ): + assert dt._resolve_discord_chrome_profile_root() == custom.resolve() + + +@pytest.mark.parametrize("bad", ["", None, "bad\x00path", "???"]) +def test_validate_chrome_profile_path_bad(bad): + with pytest.raises(ValueError): + dt._validate_chrome_profile_path(bad) + + +def test_read_discord_token_from_leveldb_parses(tmp_path): + profile = tmp_path / "chrome_profile" + leveldb_dir = profile / "Default" / "Local Storage" / "leveldb" + leveldb_dir.mkdir(parents=True) + with patch.object( + dt, + "_read_leveldb_value", + return_value=b'\x01"token-from-leveldb"', + ): + assert dt._read_discord_token_from_leveldb(profile) == "token-from-leveldb" + + +def test_read_discord_token_from_leveldb_returns_none_when_no_leveldb(tmp_path): + profile = tmp_path / "empty_profile" + profile.mkdir() + assert dt._read_discord_token_from_leveldb(profile) is None + + +def test_read_discord_token_from_leveldb_falls_back_to_legacy_key(tmp_path): + profile = tmp_path / "chrome_profile" + leveldb_dir = profile / "Default" / "Local Storage" / "leveldb" + leveldb_dir.mkdir(parents=True) + + def read_side_effect(_leveldb_dir, key): + if key == dt.DISCORD_TOKEN_KEY: + return b'\x01""' + return b'\x01"legacy-token"' + + with patch.object(dt, "_read_leveldb_value", side_effect=read_side_effect): + assert dt._read_discord_token_from_leveldb(profile) == "legacy-token" + + +@patch.object(dt, "probe_discord_user_token", return_value=True) +@patch.object(dt, "probe_discord_user_token_details", return_value={"user_id": "1"}) +@patch.object(dt, "_read_discord_token_from_leveldb", return_value="tok") +@patch.object(dt, "_resolve_discord_chrome_profile_root") +def test_extract_discord_token_auto_success( + mock_resolve, mock_read, _mock_details, _mock_probe, tmp_path, settings +): + profile = tmp_path / "profile" + profile.mkdir() + settings.DISCORD_CHROME_PROFILE_PATH = str(profile) + mock_resolve.return_value = profile + out = dt.extract_discord_token_auto() + assert out["user_token"] == "tok" + assert out["user_id"] == "1" + + +@patch.object(dt, "_resolve_discord_chrome_profile_root") +def test_extract_discord_token_auto_missing_profile(mock_resolve, settings): + settings.DISCORD_CHROME_PROFILE_PATH = "/nonexistent/profile/path" + mock_resolve.return_value = Path("/nonexistent/profile/path") + assert dt.extract_discord_token_auto() is None + + +@patch.object(dt, "probe_discord_user_token", return_value=False) +@patch.object(dt, "_read_discord_token_from_leveldb", return_value="bad-tok") +@patch.object(dt, "_resolve_discord_chrome_profile_root") +def test_extract_discord_token_auto_probe_fails( + mock_resolve, _mock_read, _mock_probe, tmp_path, settings +): + profile = tmp_path / "profile" + profile.mkdir() + settings.DISCORD_CHROME_PROFILE_PATH = str(profile) + mock_resolve.return_value = profile + assert dt.extract_discord_token_auto() is None diff --git a/discord_activity_tracker/tests/test_exporter_window.py b/discord_activity_tracker/tests/test_exporter_window.py index b7f4c91d..2c1c4428 100644 --- a/discord_activity_tracker/tests/test_exporter_window.py +++ b/discord_activity_tracker/tests/test_exporter_window.py @@ -14,7 +14,12 @@ DiscordServer, ) from discord_activity_tracker.sync.exporter_window import ( + incremental_export_after, + iter_channel_export_days, + latest_message_created_at_for_channel, latest_message_created_at_for_guild, + resolve_channel_export_after, + utc_day_start, ) @@ -88,3 +93,127 @@ def test_latest_message_respects_channel_allowlist(): srv.server_id, channel_ids=[ch1.channel_id] ) assert latest == t1 + + +def test_utc_day_start_normalizes_to_midnight(): + dt = datetime(2026, 6, 2, 22, 30, 45, tzinfo=timezone.utc) + assert utc_day_start(dt) == datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc) + + +def test_iter_channel_export_days_empty_after_is_today_only(): + now = datetime(2026, 6, 11, 15, 0, 0, tzinfo=timezone.utc) + days = iter_channel_export_days(after=None, before=None, now=now) + assert len(days) == 1 + assert days[0][0] == "2026-06-11" + assert days[0][1] == datetime(2026, 6, 11, 0, 0, 0, tzinfo=timezone.utc) + assert days[0][2] == now + + +def test_iter_channel_export_days_spans_multiple_days(): + after = datetime(2026, 6, 1, 10, 0, 0, tzinfo=timezone.utc) + before = datetime(2026, 6, 3, 8, 0, 0, tzinfo=timezone.utc) + days = iter_channel_export_days(after=after, before=before, now=before) + assert [d[0] for d in days] == ["2026-06-01", "2026-06-02", "2026-06-03"] + assert days[0][1] == after + assert days[-1][2] == before + + +@pytest.mark.django_db +def test_latest_message_per_channel(): + srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") + ch1 = DiscordChannel.objects.create( + server=srv, channel_id=_uid(), channel_name="a", channel_type="text" + ) + ch2 = DiscordChannel.objects.create( + server=srv, channel_id=_uid(), channel_name="b", channel_type="text" + ) + author = DiscordProfile.objects.create( + discord_user_id=_uid(), + username="u", + display_name="U", + avatar_url="", + is_bot=False, + ) + t1 = datetime(2026, 4, 1, 15, 0, 0, tzinfo=timezone.utc) + t2 = datetime(2026, 5, 1, 9, 0, 0, tzinfo=timezone.utc) + DiscordMessage.objects.create( + message_id=_uid(), + channel=ch1, + author=author, + content="a", + message_created_at=t1, + ) + DiscordMessage.objects.create( + message_id=_uid(), + channel=ch2, + author=author, + content="b", + message_created_at=t2, + ) + assert latest_message_created_at_for_channel(srv.server_id, ch1.channel_id) == t1 + assert latest_message_created_at_for_channel(srv.server_id, ch2.channel_id) == t2 + + +def test_incremental_export_after_floors_to_utc_day_start(): + latest = datetime(2026, 6, 10, 22, 45, 0, tzinfo=timezone.utc) + assert incremental_export_after(latest) == datetime( + 2026, 6, 10, 0, 0, 0, tzinfo=timezone.utc + ) + + +@pytest.mark.django_db +def test_resolve_channel_export_after_uses_day_start_without_explicit_since(): + srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") + ch = DiscordChannel.objects.create( + server=srv, channel_id=_uid(), channel_name="c", channel_type="text" + ) + author = DiscordProfile.objects.create( + discord_user_id=_uid(), + username="u", + display_name="U", + avatar_url="", + is_bot=False, + ) + latest = datetime(2026, 6, 10, 18, 0, 0, tzinfo=timezone.utc) + DiscordMessage.objects.create( + message_id=_uid(), + channel=ch, + author=author, + content="msg", + message_created_at=latest, + ) + resolved = resolve_channel_export_after( + srv.server_id, + ch.channel_id, + explicit_after=None, + ) + assert resolved == datetime(2026, 6, 10, 0, 0, 0, tzinfo=timezone.utc) + + +def test_resolve_channel_export_after_honors_explicit_since(): + explicit = datetime(2026, 1, 1, tzinfo=timezone.utc) + assert resolve_channel_export_after(1, 2, explicit_after=explicit) == explicit + + +def test_iter_channel_export_days_naive_before_treated_as_utc(): + after = datetime(2026, 6, 1, 10, 0, 0, tzinfo=timezone.utc) + before_naive = datetime(2026, 6, 3, 8, 0, 0) + before_aware = datetime(2026, 6, 3, 8, 0, 0, tzinfo=timezone.utc) + naive_days = iter_channel_export_days( + after=after, before=before_naive, now=before_aware + ) + aware_days = iter_channel_export_days( + after=after, before=before_aware, now=before_aware + ) + assert naive_days == aware_days + assert [d[0] for d in naive_days] == ["2026-06-01", "2026-06-02", "2026-06-03"] + + +def test_iter_channel_export_days_clips_partial_last_day(): + after = datetime(2026, 6, 2, 22, 0, 0, tzinfo=timezone.utc) + now = datetime(2026, 6, 2, 23, 30, 0, tzinfo=timezone.utc) + days = iter_channel_export_days(after=after, before=None, now=now) + assert len(days) == 1 + assert days[0][0] == "2026-06-02" + assert days[0][1] == after + assert days[0][2] == now diff --git a/discord_activity_tracker/tests/test_extract_discord_tokens_command.py b/discord_activity_tracker/tests/test_extract_discord_tokens_command.py new file mode 100644 index 00000000..8253c719 --- /dev/null +++ b/discord_activity_tracker/tests/test_extract_discord_tokens_command.py @@ -0,0 +1,51 @@ +"""Tests for extract_discord_tokens management command.""" + +from io import StringIO +from unittest.mock import patch + +import pytest +from django.core.management import call_command +from django.core.management.base import CommandError + + +@patch( + "discord_activity_tracker.management.commands.extract_discord_tokens.extract_and_save_discord_internal_tokens", + return_value="discord-tok", +) +@patch( + "discord_activity_tracker.management.commands.extract_discord_tokens._resolve_discord_chrome_profile_root", +) +def test_extract_discord_tokens_command_success( + mock_resolve_profile, mock_extract_and_save, tmp_path +): + profile = tmp_path / "chrome_profile" + profile.mkdir() + mock_resolve_profile.return_value = profile + out = StringIO() + call_command("extract_discord_tokens", stdout=out) + mock_extract_and_save.assert_called_once() + assert "Saved Discord session credentials" in out.getvalue() + + +@patch( + "discord_activity_tracker.management.commands.extract_discord_tokens.extract_and_save_discord_internal_tokens", + return_value=None, +) +@patch( + "discord_activity_tracker.management.commands.extract_discord_tokens._resolve_discord_chrome_profile_root", +) +def test_extract_discord_tokens_command_failure( + mock_resolve_profile, mock_extract_and_save, tmp_path +): + profile = tmp_path / "chrome_profile" + profile.mkdir() + mock_resolve_profile.return_value = profile + with pytest.raises(CommandError, match="Failed to load session credentials"): + call_command("extract_discord_tokens") + mock_extract_and_save.assert_called_once() + + +def test_extract_discord_tokens_command_missing_profile(settings, tmp_path): + settings.DISCORD_CHROME_PROFILE_PATH = str(tmp_path / "missing_profile") + with pytest.raises(CommandError, match="Session storage not found"): + call_command("extract_discord_tokens") diff --git a/discord_activity_tracker/tests/test_raw_archive.py b/discord_activity_tracker/tests/test_raw_archive.py new file mode 100644 index 00000000..b58e33a9 --- /dev/null +++ b/discord_activity_tracker/tests/test_raw_archive.py @@ -0,0 +1,95 @@ +"""Tests for sync/raw_archive.py.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from discord_activity_tracker.sync.raw_archive import ( + merge_exporter_json, + message_utc_date_str, +) + + +def _msg(mid: str, ts: str) -> dict: + return {"id": mid, "timestamp": ts, "content": f"msg-{mid}"} + + +def _envelope(*messages: dict) -> dict: + return { + "guild": {"id": "1", "name": "G"}, + "channel": {"id": "2", "name": "c"}, + "messages": list(messages), + } + + +def test_message_utc_date_str_parses_offset(): + assert message_utc_date_str(_msg("1", "2026-06-02T22:00:00+00:00")) == "2026-06-02" + + +def test_merge_exporter_json_creates_new_file(tmp_path: Path): + dest = tmp_path / "2026-06-02.json" + incoming = _envelope(_msg("100", "2026-06-02T10:00:00Z")) + count = merge_exporter_json(dest, incoming, day="2026-06-02") + assert count == 1 + data = json.loads(dest.read_text(encoding="utf-8")) + assert len(data["messages"]) == 1 + assert data["messages"][0]["id"] == "100" + + +def test_merge_exporter_json_appends_new_message_same_day(tmp_path: Path): + dest = tmp_path / "2026-06-02.json" + first = _envelope(_msg("100", "2026-06-02T10:00:00Z")) + merge_exporter_json(dest, first, day="2026-06-02") + second = _envelope(_msg("101", "2026-06-02T23:00:00Z")) + count = merge_exporter_json(dest, second, day="2026-06-02") + assert count == 2 + data = json.loads(dest.read_text(encoding="utf-8")) + ids = [m["id"] for m in data["messages"]] + assert ids == ["100", "101"] + + +def test_merge_exporter_json_updates_same_id(tmp_path: Path): + dest = tmp_path / "2026-06-02.json" + merge_exporter_json( + dest, + _envelope(_msg("100", "2026-06-02T10:00:00Z")), + day="2026-06-02", + ) + merge_exporter_json( + dest, + _envelope({**_msg("100", "2026-06-02T10:00:00Z"), "content": "edited"}), + day="2026-06-02", + ) + data = json.loads(dest.read_text(encoding="utf-8")) + assert len(data["messages"]) == 1 + assert data["messages"][0]["content"] == "edited" + + +def test_merge_exporter_json_filters_wrong_day(tmp_path: Path): + dest = tmp_path / "2026-06-02.json" + incoming = _envelope( + _msg("100", "2026-06-02T10:00:00Z"), + _msg("200", "2026-06-03T01:00:00Z"), + ) + count = merge_exporter_json(dest, incoming, day="2026-06-02") + assert count == 1 + data = json.loads(dest.read_text(encoding="utf-8")) + assert [m["id"] for m in data["messages"]] == ["100"] + + +def test_merge_exporter_json_refreshes_date_range(tmp_path: Path): + dest = tmp_path / "2026-06-02.json" + merge_exporter_json( + dest, + _envelope( + _msg("100", "2026-06-02T10:00:00Z"), + _msg("101", "2026-06-02T23:00:00Z"), + ), + day="2026-06-02", + ) + data = json.loads(dest.read_text(encoding="utf-8")) + assert "dateRange" in data + assert data["dateRange"]["after"].startswith("2026-06-02") + assert data["dateRange"]["before"].startswith("2026-06-02") + assert "exportedAt" in data diff --git a/discord_activity_tracker/tests/test_run_command_coverage.py b/discord_activity_tracker/tests/test_run_command_coverage.py index 755a1095..f6f5ec71 100644 --- a/discord_activity_tracker/tests/test_run_command_coverage.py +++ b/discord_activity_tracker/tests/test_run_command_coverage.py @@ -43,7 +43,7 @@ def _cmd_collector(**opts): def test_resolve_bounds_since_after_until_resets(monkeypatch, caplog): """since > until logs warning and falls back so bounds are recomputed.""" caplog.set_level("WARNING") - after, before = _resolve_exporter_date_bounds( + after, before, _per_ch = _resolve_exporter_date_bounds( {"since": "2026-06-10", "until": "2026-06-01"}, guild_snowflake=1, channel_ids=[], @@ -102,7 +102,7 @@ def test_handle_core_dry_run_skip_sync_only(monkeypatch, settings): ): collector.cmd._handle_core(collector.options, collector) out = collector.stdout.getvalue() - assert "full history" in out or "none" in out.lower() + assert "today" in out.lower() @pytest.mark.django_db @@ -223,7 +223,7 @@ def test_task_preprocess_workspace_dry_run(tmp_path, settings): def test_resolve_bounds_since_naive_becomes_utc(): - after, before = _resolve_exporter_date_bounds( + after, before, _per_ch = _resolve_exporter_date_bounds( {"since": "2026-04-01T00:00:00", "until": None}, guild_snowflake=1, channel_ids=[], diff --git a/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py b/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py index 2d38ceec..73eb8a3f 100644 --- a/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py +++ b/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py @@ -72,17 +72,18 @@ def test_parse_channel_ids_empty_string(): @pytest.mark.django_db def test_resolve_bounds_no_since_empty_db_after_is_none(settings): settings.USE_TZ = True - after, before = _resolve_exporter_date_bounds( + after, before, per_ch = _resolve_exporter_date_bounds( {"since": None, "until": None}, guild_snowflake=888001, channel_ids=[], ) assert before is None assert after is None + assert per_ch is True def test_resolve_bounds_since_until_only(): - after, before = _resolve_exporter_date_bounds( + after, before, per_ch = _resolve_exporter_date_bounds( { "since": "2026-01-01", "until": "2026-01-31", @@ -91,16 +92,18 @@ def test_resolve_bounds_since_until_only(): channel_ids=[], ) assert after is not None and before is not None + assert per_ch is False def test_resolve_bounds_explicit_since_no_until(): - after, before = _resolve_exporter_date_bounds( + after, before, per_ch = _resolve_exporter_date_bounds( {"since": "2026-05-01", "until": None}, guild_snowflake=1, channel_ids=[], ) assert after is not None assert before is None + assert per_ch is False @pytest.mark.django_db @@ -137,13 +140,14 @@ def test_resolve_bounds_no_since_uses_latest_db_message(): message_created_at=msg_time, ) - after, before = _resolve_exporter_date_bounds( + after, before, per_ch = _resolve_exporter_date_bounds( {"since": None, "until": None}, guild_snowflake=700, channel_ids=[701], ) assert before is None assert after == msg_time + assert per_ch is True # --------------------------------------------------------------------------- @@ -177,9 +181,10 @@ def test_collector_empty_channels_arg_falls_back_to_settings(monkeypatch): @pytest.mark.django_db def test_handle_core_raises_when_user_token_missing(monkeypatch): monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "") + monkeypatch.setattr(settings, "ALLOW_INTERNAL_DISCORD_TOKENS", False) monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9999) cmd, collector = _cmd_and_collector() - with pytest.raises(CommandError, match="DISCORD_USER_TOKEN"): + with pytest.raises(CommandError, match="Discord credentials not configured"): cmd._handle_core(collector.options, collector=collector) diff --git a/discord_activity_tracker/tests/test_sync_chat_exporter.py b/discord_activity_tracker/tests/test_sync_chat_exporter.py index 8681bcad..93dea194 100644 --- a/discord_activity_tracker/tests/test_sync_chat_exporter.py +++ b/discord_activity_tracker/tests/test_sync_chat_exporter.py @@ -8,6 +8,7 @@ import pytest from discord_activity_tracker.sync.chat_exporter import ( + ChannelDayExport, DiscordChatExporterError, _sorted_discord_export_json_paths, filter_discord_export_json_paths, @@ -92,6 +93,10 @@ def test_export_guild_success(tmp_path): cli = tmp_path / "DiscordChatExporter.Cli.exe" cli.write_text("fake", encoding="utf-8") out = tmp_path / "exp" + channel_id = 100 + day_str = "2026-01-02" + after = datetime(2026, 1, 2, tzinfo=timezone.utc) + before = datetime(2026, 1, 3, tzinfo=timezone.utc) proc = MagicMock() proc.stdout = StringIO("line1\n\n") @@ -100,7 +105,7 @@ def test_export_guild_success(tmp_path): def wait(): proc.returncode = 0 out.mkdir(parents=True, exist_ok=True) - (out / "guild.json").write_text("{}", encoding="utf-8") + (out / f"{channel_id}_{day_str}.json").write_text("{}", encoding="utf-8") proc.wait = wait @@ -114,9 +119,19 @@ def wait(): return_value=proc, ), ): - paths = export_guild_to_json("user-token", 42, out, include_threads="All") + exports = export_guild_to_json( + "user-token", + 42, + out, + channel_ids=[channel_id], + after_date=after, + before_date=before, + include_threads="All", + ) - assert out / "guild.json" in paths + assert len(exports) == 1 + assert exports[0].path == out / f"{channel_id}_{day_str}.json" + assert exports[0].day_str == day_str def test_export_guild_nonzero_exit_raises(tmp_path): @@ -144,12 +159,41 @@ def wait(): ), ): with pytest.raises(DiscordChatExporterError, match="exit code"): - export_guild_to_json("tok", 1, out) - - -def test_export_guild_unexpected_wraps(tmp_path): + export_guild_to_json( + "tok", + 1, + out, + channel_ids=[1], + after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), + before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), + ) + + +def test_export_guild_auth_failure_retries_with_reextracted_token(tmp_path, settings): + settings.ALLOW_INTERNAL_DISCORD_TOKENS = True cli = tmp_path / "DiscordChatExporter.Cli.exe" cli.write_text("fake", encoding="utf-8") + out = tmp_path / "exp" + + call_count = {"n": 0} + + def make_proc(): + proc = MagicMock() + proc.stdout = StringIO("line1\n") + if call_count["n"] == 0: + proc.stderr.read.return_value = "HTTP 401 Unauthorized" + proc.wait = lambda: setattr(proc, "returncode", 1) or None + else: + proc.stderr.read.return_value = "" + + def wait_ok(): + proc.returncode = 0 + out.mkdir(parents=True, exist_ok=True) + (out / "100_2026-01-02.json").write_text("{}", encoding="utf-8") + + proc.wait = wait_ok + call_count["n"] += 1 + return proc with ( patch( @@ -158,34 +202,29 @@ def test_export_guild_unexpected_wraps(tmp_path): ), patch( "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - side_effect=OSError("bad"), + side_effect=lambda *a, **k: make_proc(), + ), + patch( + "discord_activity_tracker.utils.discord_internal_tokens_store.extract_and_save_discord_internal_tokens", + return_value="fresh-tok", ), ): - with pytest.raises(DiscordChatExporterError, match="Unexpected"): - export_guild_to_json("tok", 1, tmp_path / "o") + exports = export_guild_to_json( + "old-tok", + 42, + out, + channel_ids=[100], + after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), + before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), + ) + assert call_count["n"] == 2 + assert exports[0].path == out / "100_2026-01-02.json" -def test_export_guild_output_path_uses_os_sep(tmp_path): - """The --output arg must end with os.sep (not hardcoded backslash).""" - import os +def test_export_guild_unexpected_wraps(tmp_path): cli = tmp_path / "DiscordChatExporter.Cli.exe" cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - captured = {} - - proc = MagicMock() - proc.stdout = StringIO("") - proc.stderr.read.return_value = "" - - def wait(): - proc.returncode = 0 - - proc.wait = wait - - def popen(cmd, **_kwargs): - captured["cmd"] = cmd - return proc with ( patch( @@ -194,20 +233,22 @@ def popen(cmd, **_kwargs): ), patch( "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - side_effect=popen, + side_effect=OSError("bad"), ), ): - export_guild_to_json("tok", 1, out) - - output_index = captured["cmd"].index("--output") + 1 - output_value = captured["cmd"][output_index] - assert output_value.endswith( - os.sep - ), f"--output should end with os.sep='{os.sep}', got: {output_value!r}" - - -def test_export_guild_parallel_from_settings(tmp_path, settings): - settings.DISCORD_CHAT_EXPORTER_PARALLEL = 4 + with pytest.raises(DiscordChatExporterError, match="Unexpected"): + export_guild_to_json( + "tok", + 1, + tmp_path / "o", + channel_ids=[1], + after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), + before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), + ) + + +def test_export_guild_output_path_is_explicit_json_file(tmp_path): + """Per-day export passes an explicit ``-o`` JSON file path (no directory slash).""" cli = tmp_path / "DiscordChatExporter.Cli.exe" cli.write_text("fake", encoding="utf-8") out = tmp_path / "exp" @@ -219,6 +260,8 @@ def test_export_guild_parallel_from_settings(tmp_path, settings): def wait(): proc.returncode = 0 + out.mkdir(parents=True, exist_ok=True) + (out / "1_2026-01-02.json").write_text("{}", encoding="utf-8") proc.wait = wait @@ -236,14 +279,22 @@ def popen(cmd, **_kwargs): side_effect=popen, ), ): - export_guild_to_json("tok", 1, out) + export_guild_to_json( + "tok", + 1, + out, + channel_ids=[1], + after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), + before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), + ) - par_idx = captured["cmd"].index("--parallel") - assert captured["cmd"][par_idx + 1] == "4" + output_index = captured["cmd"].index("--output") + 1 + output_value = captured["cmd"][output_index] + assert output_value.endswith("1_2026-01-02.json") -def test_export_guild_parallel_clamped_to_16(tmp_path, settings): - settings.DISCORD_CHAT_EXPORTER_PARALLEL = 99 +def test_export_guild_per_channel_parallel_is_one(tmp_path, settings): + settings.DISCORD_CHAT_EXPORTER_PARALLEL = 4 cli = tmp_path / "DiscordChatExporter.Cli.exe" cli.write_text("fake", encoding="utf-8") out = tmp_path / "exp" @@ -255,6 +306,8 @@ def test_export_guild_parallel_clamped_to_16(tmp_path, settings): def wait(): proc.returncode = 0 + out.mkdir(parents=True, exist_ok=True) + (out / "1_2026-01-02.json").write_text("{}", encoding="utf-8") proc.wait = wait @@ -272,10 +325,17 @@ def popen(cmd, **_kwargs): side_effect=popen, ), ): - export_guild_to_json("tok", 1, out) + export_guild_to_json( + "tok", + 1, + out, + channel_ids=[1], + after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), + before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), + ) par_idx = captured["cmd"].index("--parallel") - assert captured["cmd"][par_idx + 1] == "16" + assert captured["cmd"][par_idx + 1] == "1" def test_parse_channels_command_stdout_skips_threads_and_banner(): @@ -356,7 +416,14 @@ def wait(): ), ): with pytest.raises(DiscordChatExporterError, match="SIGKILL"): - export_guild_to_json("tok", 1, out) + export_guild_to_json( + "tok", + 1, + out, + channel_ids=[1], + after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), + before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), + ) def test_sequential_export_skips_channels_cli_when_channel_ids_set(tmp_path, settings): @@ -366,6 +433,9 @@ def test_sequential_export_skips_channels_cli_when_channel_ids_set(tmp_path, set cli.write_text("fake", encoding="utf-8") out = tmp_path / "exp" run_calls: list[list[str]] = [] + day_str = "2026-01-01" + after = datetime(2026, 1, 1, tzinfo=timezone.utc) + before = datetime(2026, 1, 2, tzinfo=timezone.utc) def capture_run(cmd, **_kwargs): run_calls.append(list(cmd)) @@ -384,7 +454,8 @@ def make_popen(cmd, **_kwargs): def wait(): proc.returncode = 0 - (out / f"out-{ch}.json").write_text("{}", encoding="utf-8") + out.mkdir(parents=True, exist_ok=True) + (out / f"{ch}_{day_str}.json").write_text("{}", encoding="utf-8") proc.wait = wait return proc @@ -406,22 +477,24 @@ def wait(): side_effect=make_popen, ), ): - paths = export_guild_to_json( + exports = export_guild_to_json( "tok", 1, out, channel_ids=[222, 111, 222], + after_date=after, + before_date=before, ) assert run_calls == [] - assert len(paths) == 2 + assert len(exports) == 2 def test_export_guild_adds_after_before_flags(tmp_path): cli = tmp_path / "DiscordChatExporter.Cli.exe" cli.write_text("fake", encoding="utf-8") out = tmp_path / "exp" - captured = {} + captured_cmds: list[list[str]] = [] proc = MagicMock() proc.stdout = StringIO("") @@ -429,11 +502,13 @@ def test_export_guild_adds_after_before_flags(tmp_path): def wait(): proc.returncode = 0 + out.mkdir(parents=True, exist_ok=True) + (out / "7_2026-01-02.json").write_text("{}", encoding="utf-8") proc.wait = wait def popen(cmd, **_kwargs): - captured["cmd"] = cmd + captured_cmds.append(list(cmd)) return proc with ( @@ -441,6 +516,9 @@ def popen(cmd, **_kwargs): "discord_activity_tracker.sync.chat_exporter._get_cli_path", return_value=cli, ), + patch( + "discord_activity_tracker.sync.chat_exporter.validate_discord_chat_exporter_cli_architecture", + ), patch( "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", side_effect=popen, @@ -450,12 +528,13 @@ def popen(cmd, **_kwargs): "tok", 7, out, + channel_ids=[7], after_date=datetime(2026, 1, 2, 3, 4, 5, tzinfo=timezone.utc), - before_date=datetime(2026, 2, 1, 0, 0, 0, tzinfo=timezone.utc), + before_date=datetime(2026, 1, 2, 12, 0, 0, tzinfo=timezone.utc), ) - cmd = captured["cmd"] - assert "--after" in cmd and "--before" in cmd + assert captured_cmds + assert any("--after" in cmd and "--before" in cmd for cmd in captured_cmds) def test_parse_exported_json_roundtrip(tmp_path): @@ -655,7 +734,7 @@ def test_export_and_parse_skips_bad_file(tmp_path): with patch( "discord_activity_tracker.sync.chat_exporter.export_guild_to_json", - return_value=[bad], + return_value=[ChannelDayExport(path=bad, day_str="2026-01-01", channel_id=1)], ): assert export_and_parse_guild("t", 1, tmp_path / "o") == [] @@ -671,7 +750,7 @@ def test_export_and_parse_returns_channels(tmp_path): with patch( "discord_activity_tracker.sync.chat_exporter.export_guild_to_json", - return_value=[ok], + return_value=[ChannelDayExport(path=ok, day_str="2026-01-01", channel_id=1)], ): rows = export_and_parse_guild("t", 1, tmp_path / "o") diff --git a/discord_activity_tracker/tests/test_task_discord_sync_coverage.py b/discord_activity_tracker/tests/test_task_discord_sync_coverage.py index 585ff8bd..186b6713 100644 --- a/discord_activity_tracker/tests/test_task_discord_sync_coverage.py +++ b/discord_activity_tracker/tests/test_task_discord_sync_coverage.py @@ -14,12 +14,22 @@ DiscordActivityCollector, task_discord_sync, ) +from discord_activity_tracker.sync.chat_exporter import ChannelDayExport def _phony_token() -> str: return secrets.token_hex(16) +def _channel_day_export( + path, + *, + day_str: str = "2026-01-15", + channel_id: int = 0, +) -> ChannelDayExport: + return ChannelDayExport(path=path, day_str=day_str, channel_id=channel_id) + + def _minimal_envelope(guild_id: int, channel_id: int): msg = { "id": str(10**12 + guild_id + channel_id), @@ -55,6 +65,7 @@ def test_task_discord_sync_skip_returns_early(settings): channel_ids=[], after_date=None, before_date=None, + per_channel_incremental=False, collector=collector, ) @@ -74,6 +85,7 @@ def test_task_discord_sync_dry_run_returns_early(settings): channel_ids=[], after_date=None, before_date=None, + per_channel_incremental=False, collector=collector, ) @@ -95,7 +107,7 @@ def test_task_discord_sync_happy_path_rename_raw(settings, tmp_path, monkeypatch jpath.write_text(json.dumps(_minimal_envelope(gid, cid)), encoding="utf-8") def fake_export(**_kwargs): - return [jpath] + return [_channel_day_export(jpath, day_str="2026-01-15", channel_id=cid)] cmd = MagicMock() cmd.stdout = StringIO() @@ -133,11 +145,13 @@ def fake_export(**_kwargs): channel_ids=[], after_date=datetime(2026, 1, 1, tzinfo=timezone.utc), before_date=None, + per_channel_incremental=False, collector=collector, ) - dest = raw_ch / "2026-01-01.json" + dest = raw_ch / "2026-01-15.json" assert dest.is_file() + assert not jpath.exists() @pytest.mark.django_db @@ -162,7 +176,9 @@ def test_task_discord_sync_skips_channel_not_in_allowlist(settings, tmp_path): with ( patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[jpath], + return_value=[ + _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) + ], ), patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", @@ -188,6 +204,7 @@ def test_task_discord_sync_skips_channel_not_in_allowlist(settings, tmp_path): channel_ids=[999999], after_date=None, before_date=None, + per_channel_incremental=False, collector=collector, ) @@ -218,7 +235,9 @@ def test_task_discord_sync_staging_validation_error_keeps_file( with ( patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[jpath], + return_value=[ + _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) + ], ), patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", @@ -244,6 +263,7 @@ def test_task_discord_sync_staging_validation_error_keeps_file( channel_ids=[], after_date=None, before_date=None, + per_channel_incremental=False, collector=collector, ) assert jpath.is_file() @@ -269,7 +289,9 @@ def test_task_discord_sync_value_error_unlinks(settings, tmp_path): with ( patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[jpath], + return_value=[ + _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) + ], ), patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", @@ -295,6 +317,7 @@ def test_task_discord_sync_value_error_unlinks(settings, tmp_path): channel_ids=[], after_date=None, before_date=None, + per_channel_incremental=False, collector=collector, ) assert not jpath.exists() @@ -327,6 +350,7 @@ def test_task_discord_sync_exporter_error_becomes_command_error(settings, tmp_pa channel_ids=[], after_date=None, before_date=None, + per_channel_incremental=False, collector=collector, ) @@ -352,7 +376,9 @@ def test_task_discord_sync_persist_raises_unlinks(settings, tmp_path): with ( patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[jpath], + return_value=[ + _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) + ], ), patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", @@ -378,6 +404,7 @@ def test_task_discord_sync_persist_raises_unlinks(settings, tmp_path): channel_ids=[], after_date=None, before_date=None, + per_channel_incremental=False, collector=collector, ) assert not jpath.exists() @@ -404,7 +431,9 @@ def test_task_discord_sync_stdout_includes_before_date(settings, tmp_path): with ( patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[jpath], + return_value=[ + _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) + ], ), patch( "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", @@ -430,6 +459,7 @@ def test_task_discord_sync_stdout_includes_before_date(settings, tmp_path): channel_ids=[], after_date=None, before_date=before, + per_channel_incremental=False, collector=collector, ) out = cmd.stdout.getvalue() diff --git a/discord_activity_tracker/tests/test_workspace.py b/discord_activity_tracker/tests/test_workspace.py index 2f037c8b..328ce68b 100644 --- a/discord_activity_tracker/tests/test_workspace.py +++ b/discord_activity_tracker/tests/test_workspace.py @@ -7,6 +7,7 @@ from discord_activity_tracker.workspace import ( get_channel_json_path, get_channel_raw_dir, + get_exporter_staging_dir, get_messages_json_path, get_raw_dir, get_server_dir, @@ -45,6 +46,16 @@ def test_get_channel_raw_dir_nested(settings, tmp_path): assert p.is_dir() +def test_get_exporter_staging_dir_under_workspace_root(mock_discord_workspace): + with patch( + "discord_activity_tracker.workspace.get_workspace_path", + return_value=mock_discord_workspace, + ): + staging = get_exporter_staging_dir() + assert staging == mock_discord_workspace / "_exporter_staging" + assert staging.is_dir() + + def test_get_server_dir(mock_discord_workspace): with patch( "discord_activity_tracker.workspace.get_workspace_path", diff --git a/discord_activity_tracker/utils/__init__.py b/discord_activity_tracker/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/discord_activity_tracker/utils/discord_internal_tokens_store.py b/discord_activity_tracker/utils/discord_internal_tokens_store.py new file mode 100644 index 00000000..21a481c5 --- /dev/null +++ b/discord_activity_tracker/utils/discord_internal_tokens_store.py @@ -0,0 +1,186 @@ +"""Persist Discord session credentials as JSON under workspace/discord_activity_tracker/.""" + +from __future__ import annotations + +import json +import logging +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from django.conf import settings + +from discord_activity_tracker.workspace import get_discord_internal_tokens_json_path + +logger = logging.getLogger(__name__) + +DISCORD_TOKENS_RELOGIN_HINT = "Session credentials invalid or unavailable. Check workspace configuration per .env.example." + + +def discord_internal_tokens_json_path() -> Path: + """Resolved path to the tokens JSON file.""" + override = (getattr(settings, "DISCORD_INTERNAL_TOKENS_JSON", "") or "").strip() + if override: + path = Path(override).expanduser() + if not path.is_absolute(): + path = Path.cwd() / path + return path.resolve() + return get_discord_internal_tokens_json_path().resolve() + + +def _read_document(path: Path) -> dict[str, Any]: + if not path.is_file(): + return {} + raw = path.read_text(encoding="utf-8") + if not raw.strip(): + return {} + data = json.loads(raw) + if not isinstance(data, dict): + raise ValueError(f"Invalid tokens file (expected object): {path}") + return data + + +def _write_document(path: Path, data: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(".json.tmp") + payload = json.dumps(data, indent=2, sort_keys=True) + tmp.write_text(payload + "\n", encoding="utf-8") + os.replace(tmp, path) + try: + os.chmod(path, 0o600) + except OSError: + pass + logger.debug("Saved Discord session credentials to %s", path) + + +def save_discord_internal_tokens( + user_token: str, + *, + user_id: str | None = None, + username: str | None = None, +) -> Path: + """Write session credential into workspace JSON. Returns path written.""" + user_token = (user_token or "").strip() + if not user_token: + raise ValueError("user_token is required") + + path = discord_internal_tokens_json_path() + entry: dict[str, Any] = { + "user_token": user_token, + "updated_at": datetime.now(timezone.utc).isoformat(), + } + if user_id: + entry["user_id"] = user_id + if username: + entry["username"] = username + _write_document(path, entry) + return path + + +def load_discord_internal_tokens() -> dict[str, str] | None: + """Load credential record, or None if missing.""" + path = discord_internal_tokens_json_path() + try: + doc = _read_document(path) + except (OSError, json.JSONDecodeError, ValueError) as e: + logger.warning( + "Could not read Discord session credentials from %s: %s", path, e + ) + return None + user_token = (doc.get("user_token") or "").strip() + if not user_token: + return None + out: dict[str, str] = {"user_token": user_token} + if doc.get("user_id"): + out["user_id"] = str(doc["user_id"]) + if doc.get("username"): + out["username"] = str(doc["username"]) + return out + + +def extract_and_save_discord_internal_tokens() -> str | None: + """Load credentials from workspace storage and persist to workspace JSON.""" + from discord_activity_tracker.utils.discord_tokens import extract_discord_token_auto + + tokens = extract_discord_token_auto() + if not tokens or "user_token" not in tokens: + return None + save_discord_internal_tokens( + tokens["user_token"], + user_id=tokens.get("user_id"), + username=tokens.get("username"), + ) + return tokens["user_token"] + + +def _allow_internal_discord_tokens() -> bool: + allow = getattr(settings, "ALLOW_INTERNAL_DISCORD_TOKENS", False) + if isinstance(allow, str): + return allow.strip().lower() == "true" + return bool(allow) + + +def get_discord_user_token_from_json() -> str | None: + """Return session credential from workspace JSON when internal mode is enabled.""" + if not _allow_internal_discord_tokens(): + return None + record = load_discord_internal_tokens() + if not record: + return None + return record["user_token"] + + +def log_discord_internal_tokens_still_invalid() -> None: + """Log when session credentials remain invalid after refresh.""" + logger.error( + "Discord session credentials still invalid. %s", + DISCORD_TOKENS_RELOGIN_HINT, + ) + + +def log_discord_internal_tokens_extract_failed() -> None: + """Log when session credentials could not be loaded from workspace storage.""" + logger.error( + "Failed to load Discord session credentials. %s", + DISCORD_TOKENS_RELOGIN_HINT, + ) + + +def _extract_validate_and_return() -> str | None: + """Refresh credentials from workspace storage; return token only if auth probe passes.""" + from discord_activity_tracker.utils.discord_tokens import probe_discord_user_token + + token = extract_and_save_discord_internal_tokens() + if not token: + log_discord_internal_tokens_extract_failed() + return None + if probe_discord_user_token(token): + return token + log_discord_internal_tokens_still_invalid() + return None + + +def get_or_load_discord_user_token() -> str | None: + """ + Return Discord credential for DiscordChatExporter. + + Reads workspace JSON when internal mode is enabled and refreshes when stale. + Otherwise returns credential from settings (.env). + """ + if not _allow_internal_discord_tokens(): + return (getattr(settings, "DISCORD_USER_TOKEN", "") or "").strip() or None + + from discord_activity_tracker.utils.discord_tokens import probe_discord_user_token + + token = get_discord_user_token_from_json() + if token: + if probe_discord_user_token(token): + return token + logger.info("Discord session credentials in JSON are stale; refreshing") + return _extract_validate_and_return() + + logger.info( + "Discord session credentials not in JSON; loading from workspace storage" + ) + return _extract_validate_and_return() diff --git a/discord_activity_tracker/utils/discord_tokens.py b/discord_activity_tracker/utils/discord_tokens.py new file mode 100644 index 00000000..f47f01f4 --- /dev/null +++ b/discord_activity_tracker/utils/discord_tokens.py @@ -0,0 +1,253 @@ +"""Discord session credential helpers for DiscordChatExporter flows.""" + +from __future__ import annotations + +import logging +import re +import shutil +import tempfile +from pathlib import Path + +import requests +from django.conf import settings + +logger = logging.getLogger(__name__) + +DISCORD_USERS_ME_URL = "https://discord.com/api/v9/users/@me" + +# Local storage keys for Discord session credentials. +DISCORD_TOKEN_KEY = b"_https://discord.com\x00\x01token" +DISCORD_TOKEN_KEY_LEGACY = b"_https://discordapp.com\x00\x01token" +DISCORD_TOKEN_MARKER = b"\x01token" + +CHROME_PROFILE_PATH_PATTERN = re.compile(r"^[a-zA-Z0-9/_. \-:]+$") + +# Substrings in DiscordChatExporter stderr that indicate auth failure. +DISCORD_EXPORTER_AUTH_MARKERS = ( + "401", + "403", + "unauthorized", + "Unauthorized", + "invalid token", + "Invalid token", + "not authorized", + "Not authorized", +) + + +def _validate_chrome_profile_path(path: str) -> str: + """Validate DISCORD_CHROME_PROFILE_PATH format. Raises ValueError if invalid.""" + if not path or not isinstance(path, str): + raise ValueError("DISCORD_CHROME_PROFILE_PATH must be a non-empty string") + path = path.strip() + if "\x00" in path: + raise ValueError("DISCORD_CHROME_PROFILE_PATH must not contain null bytes") + normalized = Path(path).as_posix() + if not CHROME_PROFILE_PATH_PATTERN.match(normalized): + raise ValueError( + "DISCORD_CHROME_PROFILE_PATH must contain only path characters " + "(letters, digits, /, _, ., -, space, :), got: %s" % (path[:100],) + ) + return path + + +def _resolve_discord_chrome_profile_root() -> Path: + """Return validated session storage directory for Discord credentials.""" + from discord_activity_tracker.workspace import get_chrome_profile_path + + raw = (getattr(settings, "DISCORD_CHROME_PROFILE_PATH", "") or "").strip() + if not raw: + return get_chrome_profile_path() + validated = _validate_chrome_profile_path(raw) + root = Path(validated).expanduser() + if not root.is_absolute(): + root = Path.cwd() / root + return root.resolve() + + +def _leveldb_path(profile_root: Path) -> Path: + return profile_root / "Default" / "Local Storage" / "leveldb" + + +def _parse_discord_token_raw(raw: bytes) -> str: + """Parse credential value from local storage (strip prefix byte + JSON quotes).""" + if not raw: + raise ValueError("Discord token value is empty") + if raw[0:1] in (b"\x00", b"\x01"): + text = raw[1:].decode("utf-8", errors="replace") + else: + text = raw.decode("utf-8", errors="replace") + text = text.strip() + if len(text) >= 2 and text[0] == '"' and text[-1] == '"': + text = text[1:-1] + token = text.strip() + if not token: + raise ValueError("Discord token value is empty after parsing") + return token + + +def _read_leveldb_value(leveldb_dir: Path, key: bytes) -> bytes | None: + """Read a single key from local storage; copy to temp dir if locked.""" + try: + import plyvel + except ImportError: + logger.warning( + "plyvel is not installed; cannot read session storage at %s. " + "See .env.example for supported environments.", + leveldb_dir, + ) + return None + + keys_to_try = (key,) + + def _get_from_db(db_path: str) -> bytes | None: + db = plyvel.DB(db_path, create_if_missing=False) + try: + for k in keys_to_try: + value = db.get(k) + if value is not None: + return value + for db_key, db_value in db.iterator(): + if DISCORD_TOKEN_MARKER in db_key and db_key.endswith(b"token"): + return db_value + return None + finally: + db.close() + + try: + return _get_from_db(str(leveldb_dir)) + except plyvel.Error as e: + err = str(e).lower() + if "lock" not in err and "resource temporarily unavailable" not in err: + raise + logger.debug("LevelDB locked at %s, copying to temp dir", leveldb_dir) + with tempfile.TemporaryDirectory(prefix="leveldb-") as tmp: + shutil.copytree(leveldb_dir, Path(tmp) / "leveldb", dirs_exist_ok=True) + return _get_from_db(str(Path(tmp) / "leveldb")) + + +def _read_discord_token_from_leveldb(profile_root: Path) -> str | None: + """Load Discord credential from configured session storage.""" + leveldb_dir = _leveldb_path(profile_root) + if not leveldb_dir.is_dir(): + logger.warning("LevelDB not found at %s", leveldb_dir) + return None + for key in (DISCORD_TOKEN_KEY, DISCORD_TOKEN_KEY_LEGACY): + try: + raw = _read_leveldb_value(leveldb_dir, key) + if raw: + return _parse_discord_token_raw(raw) + except ValueError as e: + logger.warning( + "Error parsing Discord credential from session storage: %s", e + ) + continue + except Exception as e: + logger.warning( + "Error reading Discord credential from session storage: %s", e + ) + continue + logger.warning("Discord credential not found in %s", leveldb_dir) + return None + + +def probe_discord_user_token(token: str) -> bool: + """Return True if credential authenticates against Discord GET /users/@me.""" + token = (token or "").strip() + if not token: + return False + try: + response = requests.get( + DISCORD_USERS_ME_URL, + headers={"Authorization": token}, + timeout=30, + ) + if response.status_code == 200: + return True + if response.status_code in (401, 403): + logger.debug( + "Discord token probe auth error: HTTP %s", response.status_code + ) + return False + logger.debug( + "Discord token probe unexpected status %s (treating as invalid)", + response.status_code, + ) + return False + except Exception as e: + logger.debug("Discord token probe request failed: %s", e) + return False + + +def probe_discord_user_token_details(token: str) -> dict | None: + """Return user details from GET /users/@me when credential is valid, else None.""" + token = (token or "").strip() + if not token: + return None + try: + response = requests.get( + DISCORD_USERS_ME_URL, + headers={"Authorization": token}, + timeout=30, + ) + if response.status_code != 200: + return None + data = response.json() + if not isinstance(data, dict): + return None + user_id = str(data.get("id") or "").strip() + username = str(data.get("username") or "").strip() + out: dict[str, str] = {} + if user_id: + out["user_id"] = user_id + if username: + out["username"] = username + return out or None + except Exception as e: + logger.debug("Discord token probe details failed: %s", e) + return None + + +def is_discord_exporter_auth_error(message: str) -> bool: + """True if DiscordChatExporter stderr/message indicates auth failure.""" + text = (message or "").lower() + if not text: + return False + if "401" in message or "403" in message: + return True + for marker in DISCORD_EXPORTER_AUTH_MARKERS: + if marker.lower() in text: + return True + return False + + +def extract_discord_token_auto() -> dict | None: + """Load Discord session credentials from configured workspace paths.""" + logger.debug("Loading Discord session credentials") + try: + profile_root = _resolve_discord_chrome_profile_root() + except ValueError as e: + logger.error("%s", e) + return None + if not profile_root.is_dir(): + logger.error( + "Session storage not found at %s. See .env.example.", + profile_root, + ) + return None + user_token = _read_discord_token_from_leveldb(profile_root) + if not user_token: + logger.error( + "Failed to read Discord credentials from workspace storage. See .env.example." + ) + return None + if not probe_discord_user_token(user_token): + logger.error( + "Discord credentials failed auth probe. Session may be expired or invalid." + ) + return None + result: dict[str, str] = {"user_token": user_token} + details = probe_discord_user_token_details(user_token) + if details: + result.update(details) + return result diff --git a/discord_activity_tracker/workspace.py b/discord_activity_tracker/workspace.py index 7bfde80a..c3335db7 100644 --- a/discord_activity_tracker/workspace.py +++ b/discord_activity_tracker/workspace.py @@ -1,4 +1,11 @@ -"""Workspace utilities - path helpers for raw export JSON and per-server data.""" +""" +Workspace utilities - path helpers for raw export JSON and per-server data. + +Layout: workspace/discord_activity_tracker/ + - chrome_profile/ (session storage for exporter credentials) + - discord_internal_tokens.json (session credentials, not .env) + - _exporter_staging/ (temporary DiscordChatExporter output; cleared each run) +""" from pathlib import Path @@ -10,6 +17,8 @@ # Pre-exported DiscordChatExporter JSON dropped here for DB import (see backfill command). CPP_DISCUSSION_IMPORT_SUBDIR = "Discussion - c-cpp-discussion" +CHROME_PROFILE_DIRNAME = "chrome_profile" +DISCORD_INTERNAL_TOKENS_FILENAME = "discord_internal_tokens.json" def get_workspace_root() -> Path: @@ -17,6 +26,18 @@ def get_workspace_root() -> Path: return get_workspace_path(_APP_SLUG) +def get_chrome_profile_path() -> Path: + """Session storage directory for Discord exporter credentials.""" + path = get_workspace_root() / CHROME_PROFILE_DIRNAME + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_discord_internal_tokens_json_path() -> Path: + """JSON file storing Discord session credentials.""" + return get_workspace_root() / DISCORD_INTERNAL_TOKENS_FILENAME + + def get_cpp_discussion_import_dir() -> Path: """Return workspace/discord_activity_tracker/Discussion - c-cpp-discussion/ (creates if missing).""" path = get_workspace_root() / CPP_DISCUSSION_IMPORT_SUBDIR @@ -32,8 +53,8 @@ def get_raw_dir() -> Path: def get_exporter_staging_dir() -> Path: - """Temporary directory for DiscordChatExporter guild output before per-channel archival.""" - path = get_raw_dir() / "_exporter_staging" + """Temporary directory for DiscordChatExporter output before per-day archival.""" + path = get_workspace_root() / "_exporter_staging" path.mkdir(parents=True, exist_ok=True) return path diff --git a/docker-compose.yml b/docker-compose.yml index cb105a5b..1d19806e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,6 @@ # Boost Data Collector - Docker Compose # Runs: Redis, Django (gunicorn), Celery worker, Celery beat. -# Optional: slack-chromium (profile slack-session) for noVNC Slack login on headless hosts. +# Optional: slack-chromium (profile slack-session) / discord-chromium (discord-session) for noVNC login. # DATABASE_URL must be set in .env (host Postgres, or postgres://...@db:5432/... if you enable db). services: @@ -45,6 +45,20 @@ services: SE_BROWSER_ARGS_NO_SANDBOX: "--no-sandbox" SE_BROWSER_ARGS_DISABLE_DEV_SHM: "--disable-dev-shm-usage" + discord-chromium: + profiles: ["discord-session"] + image: selenium/standalone-chrome:145.0-chromedriver-145.0-20260222 + platform: linux/amd64 + shm_size: "2g" + ports: + - "127.0.0.1:7901:7900" + volumes: + - ./workspace/discord_activity_tracker/chrome_profile:/home/seluser/.config/google-chrome + environment: + SE_BROWSER_ARGS_USER_DATA_DIR: "--user-data-dir=/home/seluser/.config/google-chrome" + SE_BROWSER_ARGS_NO_SANDBOX: "--no-sandbox" + SE_BROWSER_ARGS_DISABLE_DEV_SHM: "--disable-dev-shm-usage" + web: build: . restart: unless-stopped @@ -61,6 +75,7 @@ services: CELERY_RESULT_BACKEND: redis://redis:6379/0 ALLOWED_HOSTS: ${ALLOWED_HOSTS:-localhost,127.0.0.1,web,0.0.0.0} CHROME_PROFILE_PATH: /app/workspace/slack_event_handler/chrome_profile + DISCORD_CHROME_PROFILE_PATH: /app/workspace/discord_activity_tracker/chrome_profile volumes: - ./workspace:/app/workspace - ./logs:/app/logs @@ -94,6 +109,7 @@ services: CELERY_BROKER_URL: redis://redis:6379/0 CELERY_RESULT_BACKEND: redis://redis:6379/0 CHROME_PROFILE_PATH: /app/workspace/slack_event_handler/chrome_profile + DISCORD_CHROME_PROFILE_PATH: /app/workspace/discord_activity_tracker/chrome_profile volumes: - ./workspace:/app/workspace - ./logs:/app/logs diff --git a/docs/Architecture_overview.md b/docs/Architecture_overview.md index b3a1fe33..21162931 100644 --- a/docs/Architecture_overview.md +++ b/docs/Architecture_overview.md @@ -74,7 +74,7 @@ Columns: **persistence** (usual durable stores), **coupling** (one-line upstream | App | Entry command | Notes | |-----|---------------|-------| -| `slack_event_handler` | `run_slack_event_handler` | Slack Socket Mode listener (PR bot / huddles); see [Docker.md §4b](Docker.md#4b-slack-session-tokens-huddle-transcripts-optional) | +| `slack_event_handler` | `run_slack_event_handler` | Slack Socket Mode listener (PR bot / huddles) | --- diff --git a/docs/Deployment.md b/docs/Deployment.md index 9a0a567e..0b127839 100644 --- a/docs/Deployment.md +++ b/docs/Deployment.md @@ -435,14 +435,6 @@ server { Reload nginx after testing config (`sudo nginx -t && sudo systemctl reload nginx`). -**Slack login (optional, port 7900):** When using `docker compose --profile slack-session`, noVNC is bound to **`127.0.0.1:7900`**. Access from your laptop via SSH port forwarding: - -```bash -ssh -L 7900:127.0.0.1:7900 YOUR_DEPLOY_USER@YOUR_SERVER_HOST -``` - -Then open **http://localhost:7900**, sign in to Slack, stop `slack-chromium`, and run `docker compose run --rm web python manage.py extract_slack_tokens` (from the repo root on the server; same as `make extract-slack-tokens`). If you run **`run_slack_event_handler`** outside Compose, mount the same `workspace` path and `.env` as the Docker stack. - --- ## Deploy Script Behavior diff --git a/docs/Docker.md b/docs/Docker.md index 6db28799..346fcfa9 100644 --- a/docs/Docker.md +++ b/docs/Docker.md @@ -115,65 +115,6 @@ After this, the app is ready to use. --- -## 4b. Slack session tokens (huddle transcripts, optional) - -Huddle flows may need internal Slack session tokens with `ALLOW_INTERNAL_SLACK_TOKENS=true`. Tokens are stored in `workspace/slack_event_handler/slack_internal_tokens.json` (not `.env`). Extraction reads a Chrome profile under `workspace/slack_event_handler/chrome_profile` (see `CHROME_PROFILE_PATH` in `.env.example`). - -**Windows (native venv):** `plyvel` is omitted from locked dependencies on Windows (LevelDB C++ headers required to build). Use Docker/WSL/Linux for `manage.py extract_slack_tokens` when LevelDB (`localConfig_v2`) must be read; cookie/SQLite paths may still work on Windows for xoxd only. - -**Headless server (no UI on the host):** - -1. Start the optional login container (persists Chrome data under `workspace/slack_event_handler/chrome_profile`): - - ```bash - make slack-login - # or: docker compose --profile slack-session up -d slack-chromium - ``` - - The optional **`slack-chromium`** service runs **`selenium/standalone-chrome`** as an **interactive noVNC desktop** (manual Slack login only — not the project's old Selenium-driven token flow). Chrome's user-data directory in the container is `/home/seluser/.config/google-chrome`; Compose bind-mounts that path to **`CHROME_PROFILE_PATH`** on the host (default: `workspace/slack_event_handler/chrome_profile`). - -2. From your laptop, SSH tunnel noVNC (port **7900**): - - ```bash - ssh -L 7900:127.0.0.1:7900 YOUR_USER@YOUR_SERVER - ``` - - Open **http://localhost:7900** and sign in at **https://app.slack.com**. - -3. Stop the login container before extraction (avoids LevelDB locks): - - ```bash - docker compose --profile slack-session stop slack-chromium - ``` - -4. Extract tokens into workspace JSON (read by running web/celery via the mounted workspace volume): - - ```bash - make extract-slack-tokens # writes workspace/slack_event_handler/slack_internal_tokens.json - ``` - - **One-shot (login → wait → extract):** - - ```bash - make slack-tokens-refresh - ``` - - While `slack-wait-profile` runs, sign in at **http://127.0.0.1:7900**. If the profile already exists: - - ```bash - make slack-tokens-reextract - ``` - -**Token usage at runtime:** `fetch_huddle_transcript` reads `workspace/slack_event_handler/slack_internal_tokens.json`. If JSON is missing or tokens fail an auth probe, it re-extracts from `CHROME_PROFILE_PATH` and updates JSON automatically (no `make slack-tokens-reextract`, no scripted browser navigation). If the Chrome session itself expired, use `make slack-tokens-refresh` (manual noVNC login). - -| Command | When to use | -|---------|-------------| -| `make extract-slack-tokens` | Write/update token JSON from Chrome profile | -| `make slack-tokens-reextract` | Profile already logged in; extract JSON only | -| `make slack-tokens-refresh` | First-time or expired session (noVNC login, then extract) | - ---- - ## 5. Open the app In your browser go to: diff --git a/docs/GCP_Production_Checklist.md b/docs/GCP_Production_Checklist.md index 018ab779..c9239114 100644 --- a/docs/GCP_Production_Checklist.md +++ b/docs/GCP_Production_Checklist.md @@ -27,7 +27,7 @@ Mirror [`.env.example`](../.env.example) groups; inject via Secret Manager → e |----------|----------------| | GitHub | `GITHUB_TOKEN`, `GITHUB_TOKENS_SCRAPING`, `GITHUB_TOKEN_WRITE` | | Slack | `SLACK_TEAM_IDS`, `SLACK_BOT_TOKEN_*`, `SLACK_APP_TOKEN_*` | -| Discord | `DISCORD_TOKEN` or `DISCORD_USER_TOKEN`, `DISCORD_SERVER_ID`, exporter paths | +| Discord | `DISCORD_TOKEN`, `DISCORD_SERVER_ID`, exporter paths | | Pinecone | `PINECONE_API_KEY`, `PINECONE_INDEX_NAME`, … | | YouTube | `YOUTUBE_API_KEY` | | WG21 | `WG21_GITHUB_DISPATCH_*` (see `config/settings.py`) | diff --git a/docs/Workspace.md b/docs/Workspace.md index 4e061d2c..b9f64c08 100644 --- a/docs/Workspace.md +++ b/docs/Workspace.md @@ -21,13 +21,13 @@ workspace/ # WORKSPACE_DIR (configurable via │ └── boost_mailing_list_tracker/ # Raw API responses (kept, not removed) │ └── /.json │ └── discord_activity_tracker/ # DiscordChatExporter output (see below) -│ ├── _exporter_staging/ # Temporary guild export (cleared each run) -│ └── // # Archived JSON after DB import +│ └── // # Archived JSON after DB import (YYYY-MM-DD.json) ├── clang_github_tracker/ # Markdown export for clang_github_tracker (md_export/) ├── boost_mailing_list_tracker/ # Mailing list messages (see below) │ └── / │ └── messages/.json # Formatted cache (processed then removed) ├── discord_activity_tracker/ # CLI install + backfill drop folder +│ ├── _exporter_staging/ # Temporary per-day export (cleared each run) │ ├── script/ # DiscordChatExporter.Cli (default layout; optional) │ └── Discussion - c-cpp-discussion/ # Pre-exported JSON for backfill (removed after import) └── shared/ # Temp files used by more than one app @@ -51,7 +51,7 @@ So: **raw/** = permanent archive of scraped API responses; **messages/** = short ### discord_activity_tracker paths -1. **`run_discord_activity_tracker`** — DiscordChatExporter writes to `raw/discord_activity_tracker/_exporter_staging/`, then JSON is parsed, upserted into the DB, and **moved** under `raw/discord_activity_tracker///` (kept as an archive). +1. **`run_discord_activity_tracker`** — DiscordChatExporter runs **per channel per UTC day**, writing scratch JSON under `discord_activity_tracker/_exporter_staging/`. Each file is parsed, upserted into the DB, then **merged** into `raw/discord_activity_tracker///YYYY-MM-DD.json` (same-day re-runs append/update by message id). 2. **`backfill_discord_activity_tracker`** — Place DiscordChatExporter JSON under `discord_activity_tracker/Discussion - c-cpp-discussion/` (any depth). Each file is imported, then **deleted** so it is not processed twice. See [service_api/discord_activity_tracker.md](service_api/discord_activity_tracker.md) and [operations/discord_chat_exporter.md](operations/discord_chat_exporter.md). diff --git a/docs/operations/discord_chat_exporter.md b/docs/operations/discord_chat_exporter.md index f7a8640b..0b0f1719 100644 --- a/docs/operations/discord_chat_exporter.md +++ b/docs/operations/discord_chat_exporter.md @@ -2,49 +2,7 @@ This project uses **[DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter)** (CLI), not a separate product named “DiscordExpert.” The GUI and CLI come from the same Tyrrrz releases; ingestion here runs the **CLI** only (`export`, `exportguild`, `channels`), driven by `discord_activity_tracker/sync/chat_exporter.py` and `manage.py run_discord_activity_tracker`. -Using a **user token** with DiscordChatExporter may violate Discord’s Terms of Service; prefer official APIs / bots when possible. Document tokens securely and never commit them. - ---- - -## Discord token and IDs (for fetching) - -`manage.py run_discord_activity_tracker` reads **`DISCORD_USER_TOKEN`** from `.env`. DiscordChatExporter uses the same kind of value as its CLI **`-t`** / `--token` argument when talking to Discord. - -### Where to get the token - -Discord does **not** publish a supported “export my user token” flow for this use case. **Follow the maintained upstream guide** (it is updated when the Discord client or API changes): - -- **[Token and IDs](https://github.com/Tyrrrz/DiscordChatExporter/blob/master/.docs/Token-and-IDs.md)** — how to obtain a token and copy **server** / **channel** snowflake IDs. - -**Built-in CLI help** (after you install the binary, see below): - -- **macOS / Linux:** `./DiscordChatExporter.Cli guide` -- **Windows (`cmd`):** `DiscordChatExporter.Cli.exe guide` (no leading `./`) - -That command prints the same class of instructions as the wiki. - -Put the token in **`.env`** (never commit it): - -```env -DISCORD_USER_TOKEN=your_token_here -``` - -### Bot token vs user token - -| Item | Env var in this repo | Used by ChatExporter fetch? | -|------|----------------------|----------------------------| -| **User** (account) token | `DISCORD_USER_TOKEN` | **Yes** — required for `run_discord_activity_tracker` → DiscordChatExporter. | -| **Bot** token from the [Developer Portal](https://discord.com/developers/applications) | `DISCORD_TOKEN` | **No** for this exporter path (reserved for other / future bot-based features). | - -If export fails with “unauthorized” or similar, double-check you pasted the **user** token, not a bot token, and that the value has no extra quotes or spaces. - -### Server and channel IDs - -Set **`DISCORD_SERVER_ID`** to the guild you want. Optionally set **`DISCORD_CHANNEL_IDS`** to a comma-separated list of channel snowflakes; leave empty when you want the exporter to include all relevant channels (see behavior in [service_api/discord_activity_tracker.md](../service_api/discord_activity_tracker.md)). **Developer Mode** in the Discord app (Settings → Advanced) enables **Copy ID** on servers and channels; details are in **Token and IDs** above. - -### If the token leaks - -Treat it like a password. Revoke or rotate it using the same upstream steps you used to obtain it, and follow [Discord’s account security guidance](https://support.discord.com/hc/en-us/categories/360001371893-Account-Security-Verification). Do not paste tokens into chat, tickets, or screenshots. +Exporter credentials and Discord server/channel IDs are configured via `.env` (see `.env.example`). User-account automation may violate Discord’s Terms of Service; prefer official APIs and bots when possible. --- @@ -84,7 +42,6 @@ All variables live in `.env` (see `.env.example` in the repo root). The ones tha | Variable | Purpose | |----------|---------| -| `DISCORD_USER_TOKEN` | Token passed to DiscordChatExporter for export (required for `run_discord_activity_tracker` fetch path). | | `DISCORD_SERVER_ID` | Guild snowflake to export. | | `DISCORD_CHANNEL_IDS` | Optional comma-separated channel IDs; empty often means “all text channels” depending on exporter mode. | | `DISCORD_CHAT_EXPORTER_CLI` | Optional absolute path to `DiscordChatExporter.Cli` / `.exe` if not using `workspace/.../script/`. | diff --git a/docs/service_api/discord_activity_tracker.md b/docs/service_api/discord_activity_tracker.md index 79440325..37a08947 100644 --- a/docs/service_api/discord_activity_tracker.md +++ b/docs/service_api/discord_activity_tracker.md @@ -60,10 +60,11 @@ Collectors, management commands, and sync layers classify failures with [`classi | Module / symbol | Role | | --------------- | ---- | -| `sync/chat_exporter.py` | Runs **DiscordChatExporter** (`exportguild`, etc.), date bounds in UTC, filters JSON paths. Used by **`run_discord_activity_tracker`**. | -| `sync/messages.py` | `_prepare_message_data`, `_process_messages_in_batches` (calls `bulk_process_message_batch`). Also exposes **discord.py** helpers (`DiscordSyncClient`, `sync_all_channels`, …) for Bot API–style sync; those entry points are **not** wired to `run_discord_activity_tracker` today (that command uses the exporter + user token only). | +| `sync/chat_exporter.py` | Runs **DiscordChatExporter** per channel per UTC day (`export`), date bounds in UTC. Used by **`run_discord_activity_tracker`**. | +| `sync/raw_archive.py` | `merge_exporter_json` — merge daily JSON archives by message id under `raw/discord_activity_tracker/`. | +| `sync/messages.py` | `_prepare_message_data`, `_process_messages_in_batches` (calls `bulk_process_message_batch`). Also exposes **discord.py** helpers (`DiscordSyncClient`, `sync_all_channels`, …) for Bot API–style sync; those entry points are **not** wired to `run_discord_activity_tracker` today (that command uses the DiscordChatExporter CLI only). | | `sync/client.py` | `DiscordSyncClient` — discord.py wrapper (intents, fetch guild/channel/messages). | -| `sync/exporter_window.py` | `latest_message_created_at_for_guild` — lower bound for incremental exporter runs when `--since` is omitted. | +| `sync/exporter_window.py` | `latest_message_created_at_for_guild`, `iter_channel_export_days` — DB lower bound and UTC day windows for exporter runs. | | `sync/utils.py` | Parsing helpers shared by exporter and message pipelines. | | `sync/export.py` | Markdown export from DB (used downstream of sync; see command help for `DISCORD_CONTEXT_*` settings). | @@ -75,13 +76,13 @@ Two management commands handle message ingestion. Both use **`AbstractCollector` ### `run_discord_activity_tracker` — incremental / scheduled -Uses **DiscordChatExporter** CLI with the user token. Setup (download, install path, env vars): [DiscordChatExporter operations doc](../operations/discord_chat_exporter.md). +Uses **DiscordChatExporter** CLI with configured exporter credentials. Setup (download, install path, env vars): [DiscordChatExporter operations doc](../operations/discord_chat_exporter.md). Fetches into a staging directory, persists to the database, then archives JSON under: `{WORKSPACE_DIR}/raw/discord_activity_tracker///` -Date bounds passed to the exporter use **UTC** (see `sync/chat_exporter.py`). When `--since` is omitted, the lower bound is the latest stored message time for this guild (and channel allowlist). If the database has no matching rows, no `--after` filter is applied (full history). When `--until` is omitted, there is no upper bound (export through the present). If `--since` and `--until` are both set but **since is after until**, the command logs a warning and treats both as unset, then recomputes bounds from the rules above. +DiscordChatExporter runs **once per channel per UTC calendar day** in the resolved window. Date bounds use **UTC** (see `sync/chat_exporter.py` and `sync/exporter_window.py`). When `--since` is omitted, the lower bound is the latest stored message time for this guild (and channel allowlist). If the database has no matching rows, only **today (UTC)** is exported. When `--until` is omitted, there is no upper bound (export through the present). Raw archives are stored as `YYYY-MM-DD.json` per channel; later runs **merge** new messages into the same file by message id. If `--since` and `--until` are both set but **since is after until**, the command logs a warning and treats both as unset, then recomputes bounds from the rules above. ``` python manage.py run_discord_activity_tracker [options] diff --git a/scripts/clean-macos.sh b/scripts/clean-macos.sh index 0b82ccca..cefe154b 100644 --- a/scripts/clean-macos.sh +++ b/scripts/clean-macos.sh @@ -11,12 +11,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT="${1:-$(cd "$SCRIPT_DIR/.." && pwd)}" echo "Scanning for macOS ._* files under: $ROOT" -COUNT=$(find "$ROOT" -name '._*' -not -path '*/.git/*' 2>/dev/null | wc -l | tr -d ' ') +COUNT=$(find "$ROOT" -name '._*' 2>/dev/null | wc -l | tr -d ' ') if [ "$COUNT" -eq 0 ]; then echo "No ._* files found. Nothing to clean." exit 0 fi -find "$ROOT" -name '._*' -not -path '*/.git/*' -delete 2>/dev/null +find "$ROOT" -name '._*' -delete 2>/dev/null echo "Removed $COUNT ._* file(s)." diff --git a/scripts/wait_discord_chrome_profile.sh b/scripts/wait_discord_chrome_profile.sh new file mode 100644 index 00000000..88e1cc64 --- /dev/null +++ b/scripts/wait_discord_chrome_profile.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Wait until discord-chromium has written a usable Chrome profile (Cookies + LevelDB). +set -euo pipefail + +PROFILE_ROOT="${1:-workspace/discord_activity_tracker/chrome_profile}" +COOKIES="${PROFILE_ROOT}/Default/Cookies" +LEVELDB="${PROFILE_ROOT}/Default/Local Storage/leveldb" +TIMEOUT_SEC="${DISCORD_PROFILE_WAIT_TIMEOUT:-600}" +INTERVAL_SEC="${DISCORD_PROFILE_WAIT_INTERVAL:-5}" + +if ! [[ "${TIMEOUT_SEC}" =~ ^[0-9]+$ ]] || ! [[ "${INTERVAL_SEC}" =~ ^[0-9]+$ ]]; then + echo "DISCORD_PROFILE_WAIT_TIMEOUT and DISCORD_PROFILE_WAIT_INTERVAL must be non-negative integers." >&2 + exit 1 +fi +if (( TIMEOUT_SEC <= 0 || INTERVAL_SEC <= 0 )); then + echo "DISCORD_PROFILE_WAIT_TIMEOUT and DISCORD_PROFILE_WAIT_INTERVAL must be > 0." >&2 + exit 1 +fi + +deadline=$((SECONDS + TIMEOUT_SEC)) +echo "Waiting for Discord Chrome profile under ${PROFILE_ROOT}" +echo " Sign in at http://127.0.0.1:7901 → https://discord.com" +echo " Timeout: ${TIMEOUT_SEC}s (override with DISCORD_PROFILE_WAIT_TIMEOUT)" + +while (( SECONDS < deadline )); do + if [[ -f "${COOKIES}" && -s "${COOKIES}" && -d "${LEVELDB}" ]]; then + if compgen -G "${LEVELDB}/*" > /dev/null; then + echo "Profile ready (${COOKIES}, ${LEVELDB})." + exit 0 + fi + fi + sleep "${INTERVAL_SEC}" +done + +echo "Timed out waiting for Chrome profile. Check noVNC login and discord-chromium logs." >&2 +exit 1 diff --git a/slack_event_handler/management/commands/extract_slack_tokens.py b/slack_event_handler/management/commands/extract_slack_tokens.py index 9b98ccf4..8ef0726a 100644 --- a/slack_event_handler/management/commands/extract_slack_tokens.py +++ b/slack_event_handler/management/commands/extract_slack_tokens.py @@ -1,7 +1,7 @@ """ Management command: extract_slack_tokens -Reads xoxc/xoxd from CHROME_PROFILE_PATH and saves them to workspace JSON. +Persist Slack session credentials to workspace JSON. """ import logging @@ -22,9 +22,8 @@ class Command(BaseCommand): help = ( - "Extract Slack xoxc/xoxd tokens from CHROME_PROFILE_PATH and write " - "workspace/slack_event_handler/slack_internal_tokens.json. " - "Stop slack-chromium (slack-session profile) before running to avoid LevelDB locks." + "Persist Slack session credentials to " + "workspace/slack_event_handler/slack_internal_tokens.json." ) def add_arguments(self, parser): @@ -50,9 +49,9 @@ def handle(self, *args, **options): if not allow: self.stderr.write( self.style.WARNING( - "ALLOW_INTERNAL_SLACK_TOKENS is not true: tokens will be saved to " + "Internal Slack session mode is not enabled: credentials will be saved to " "workspace JSON but ignored by Django until enabled. " - "Restart web/celery after enabling." + "Restart web/celery after enabling. See .env.example." ) ) @@ -63,20 +62,17 @@ def handle(self, *args, **options): profile_path = str(profile) if not profile.is_dir(): raise CommandError( - "Chrome profile not found at CHROME_PROFILE_PATH " + "Session storage not found " f"({profile_path}). Expected: {get_chrome_profile_path()}. " - "Log into Slack via make slack-login, then re-run extract_slack_tokens." + "See .env.example." ) pair = extract_and_save_slack_internal_tokens(team_id) if not pair: - raise CommandError( - "Token extraction failed. Ensure Slack is logged in under " - f"CHROME_PROFILE_PATH ({profile_path}) and slack-chromium is stopped." - ) + raise CommandError("Failed to load session credentials. See .env.example.") out_path = slack_internal_tokens_json_path() self.stdout.write( self.style.SUCCESS( - f"Extracted tokens for team {team_id}; saved to {out_path}." + f"Saved session credentials for team {team_id} to {out_path}." ) ) diff --git a/slack_event_handler/tests/test_extract_slack_tokens_command.py b/slack_event_handler/tests/test_extract_slack_tokens_command.py index 1386f2a7..5fe65dc0 100644 --- a/slack_event_handler/tests/test_extract_slack_tokens_command.py +++ b/slack_event_handler/tests/test_extract_slack_tokens_command.py @@ -24,7 +24,7 @@ def test_extract_slack_tokens_command_success( out = StringIO() call_command("extract_slack_tokens", "--team-id=T1", stdout=out) mock_extract_and_save.assert_called_once_with("T1") - assert "saved to" in out.getvalue() + assert "Saved session credentials" in out.getvalue() @patch( @@ -40,12 +40,12 @@ def test_extract_slack_tokens_command_failure( profile = tmp_path / "chrome_profile" profile.mkdir() mock_resolve_profile.return_value = profile - with pytest.raises(CommandError, match="Token extraction failed"): + with pytest.raises(CommandError, match="Failed to load session credentials"): call_command("extract_slack_tokens", "--team-id=T1") mock_extract_and_save.assert_called_once_with("T1") def test_extract_slack_tokens_command_missing_profile(settings, tmp_path): settings.CHROME_PROFILE_PATH = str(tmp_path / "missing_profile") - with pytest.raises(CommandError, match="Chrome profile not found"): + with pytest.raises(CommandError, match="Session storage not found"): call_command("extract_slack_tokens", "--team-id=T21Q22G66") diff --git a/slack_event_handler/tests/test_slack_internal_tokens_store.py b/slack_event_handler/tests/test_slack_internal_tokens_store.py index 6755eab2..667d3be3 100644 --- a/slack_event_handler/tests/test_slack_internal_tokens_store.py +++ b/slack_event_handler/tests/test_slack_internal_tokens_store.py @@ -106,7 +106,7 @@ def test_get_or_load_logs_when_reextracted_tokens_still_invalid( assert pair is None mock_extract.assert_called_once_with("T1") assert "still invalid" in caplog.text - assert "slack-tokens-refresh" in caplog.text + assert ".env.example" in caplog.text @override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True, WORKSPACE_DIR="/tmp/ws") diff --git a/slack_event_handler/utils/slack_internal_tokens_store.py b/slack_event_handler/utils/slack_internal_tokens_store.py index 73096137..43329e5f 100644 --- a/slack_event_handler/utils/slack_internal_tokens_store.py +++ b/slack_event_handler/utils/slack_internal_tokens_store.py @@ -1,6 +1,4 @@ -""" -Persist Slack internal session tokens (xoxc/xoxd) as JSON under workspace/slack_event_handler/. -""" +"""Persist Slack session credentials as JSON under workspace/slack_event_handler/.""" from __future__ import annotations @@ -17,12 +15,7 @@ logger = logging.getLogger(__name__) -# Shown when JSON/profile tokens fail auth even after re-extract from CHROME_PROFILE_PATH. -SLACK_TOKENS_RELOGIN_HINT = ( - "Chrome session may be expired, profile missing, or slack-chromium is still running " - "(LevelDB lock). Log in via make slack-tokens-refresh (noVNC http://127.0.0.1:7900) " - "or make slack-tokens-reextract if the profile is already signed in." -) +SLACK_TOKENS_RELOGIN_HINT = "Session credentials invalid or unavailable. Check workspace configuration per .env.example." def slack_internal_tokens_json_path() -> Path: @@ -74,7 +67,7 @@ def save_slack_internal_tokens( team_name: str | None = None, user_id: str | None = None, ) -> Path: - """Write xoxc/xoxd for team_id into workspace JSON. Returns path written.""" + """Write session credentials for team_id into workspace JSON. Returns path written.""" team_id = (team_id or "").strip() xoxc = (xoxc or "").strip() xoxd = (xoxd or "").strip() @@ -124,7 +117,7 @@ def load_slack_internal_tokens(team_id: str) -> dict[str, str] | None: def extract_and_save_slack_internal_tokens(team_id: str) -> tuple[str, str] | None: - """Read xoxc/xoxd from CHROME_PROFILE_PATH and persist to workspace JSON.""" + """Load session credentials from workspace storage and persist to workspace JSON.""" from slack_event_handler.utils.slack_tokens import extract_slack_tokens_auto tokens = extract_slack_tokens_auto(team_id) @@ -141,11 +134,7 @@ def extract_and_save_slack_internal_tokens(team_id: str) -> tuple[str, str] | No def get_slack_internal_token_pair(team_id: str | None = None) -> tuple[str, str] | None: - """ - Return (xoxc, xoxd) when ALLOW_INTERNAL_SLACK_TOKENS is enabled. - - Reads workspace JSON (not .env). Pass team_id or uses default from SLACK_TEAM_IDS. - """ + """Return session credential pair from workspace JSON when internal mode is enabled.""" allow = getattr(settings, "ALLOW_INTERNAL_SLACK_TOKENS", False) if isinstance(allow, str): allow = allow.strip().lower() == "true" @@ -176,26 +165,25 @@ def _resolve_team_id(team_id: str | None = None) -> str: def log_slack_internal_tokens_still_invalid(team_id: str) -> None: - """Log a clear error when tokens remain invalid after re-extract from Chrome profile.""" + """Log when session credentials remain invalid after refresh.""" logger.error( - "Slack internal tokens still invalid for team %s after re-extract from " - "CHROME_PROFILE_PATH. %s", + "Slack session credentials still invalid for team %s. %s", team_id, SLACK_TOKENS_RELOGIN_HINT, ) def log_slack_internal_tokens_extract_failed(team_id: str) -> None: - """Log a clear error when token extraction from Chrome profile fails.""" + """Log when session credentials could not be loaded from workspace storage.""" logger.error( - "Failed to extract Slack internal tokens from CHROME_PROFILE_PATH for team %s. %s", + "Failed to load Slack session credentials for team %s. %s", team_id, SLACK_TOKENS_RELOGIN_HINT, ) def _extract_validate_and_return(team_id: str) -> tuple[str, str] | None: - """Re-extract from profile, save JSON, and return pair only if auth probe passes.""" + """Refresh credentials from workspace storage; return pair only if auth probe passes.""" from slack_event_handler.utils.slack_tokens import probe_slack_internal_tokens pair = extract_and_save_slack_internal_tokens(team_id) @@ -212,11 +200,10 @@ def get_or_load_slack_internal_token_pair( team_id: str | None = None, ) -> tuple[str, str] | None: """ - Return (xoxc, xoxd) from workspace JSON. + Return session credential pair from workspace JSON. - If JSON is missing, or stored tokens fail an auth probe, re-extract from - CHROME_PROFILE_PATH and update JSON automatically (no manual make target). - Returns None and logs a clear error if re-extracted tokens are still invalid. + Refreshes from workspace storage when JSON is missing or credentials fail auth probe. + Returns None if credentials remain invalid. """ from slack_event_handler.utils.slack_tokens import probe_slack_internal_tokens @@ -235,14 +222,13 @@ def get_or_load_slack_internal_token_pair( if probe_slack_internal_tokens(pair[0], pair[1]): return pair logger.info( - "Slack internal tokens in JSON are stale for team %s; " - "re-extracting from Chrome profile", + "Slack session credentials in JSON are stale for team %s; refreshing", tid, ) return _extract_validate_and_return(tid) logger.info( - "Slack internal tokens not in JSON; extracting from Chrome profile for team %s", + "Slack session credentials not in JSON; loading for team %s", tid, ) return _extract_validate_and_return(tid) diff --git a/slack_event_handler/utils/slack_tokens.py b/slack_event_handler/utils/slack_tokens.py index 2bfa8da1..75a50a98 100644 --- a/slack_event_handler/utils/slack_tokens.py +++ b/slack_event_handler/utils/slack_tokens.py @@ -1,7 +1,4 @@ -""" -Slack Token Extractor Module -Reads xoxc and xoxd tokens from a logged-in Chrome user profile on disk. -""" +"""Slack session credential helpers for huddle transcript flows.""" import json import logging @@ -15,7 +12,7 @@ logger = logging.getLogger(__name__) -# Slack files.info errors that indicate stale xoxc/xoxd (not missing file). +# Slack files.info errors that indicate stale session credentials (not missing file). SLACK_INTERNAL_TOKEN_AUTH_ERRORS = frozenset( { "invalid_auth", @@ -34,7 +31,7 @@ LOCAL_CONFIG_V2_KEY = b"_https://app.slack.com\x00\x01localConfig_v2" LOCAL_CONFIG_V2_MARKER = b"localConfig_v2" -# Chrome profile path: validate normalized POSIX form (Windows drive letters via ":"). +# Session storage path: validate normalized POSIX form (Windows drive letters via ":"). CHROME_PROFILE_PATH_PATTERN = re.compile(r"^[a-zA-Z0-9/_. \-:]+$") @@ -55,7 +52,7 @@ def _validate_chrome_profile_path(path: str) -> str: def _resolve_chrome_profile_root() -> Path: - """Return validated Chrome user-data directory (workspace/slack_event_handler/chrome_profile).""" + """Return validated session storage directory for Slack credentials.""" from slack_event_handler.workspace import get_chrome_profile_path raw = (getattr(settings, "CHROME_PROFILE_PATH", "") or "").strip() @@ -77,7 +74,7 @@ def _cookies_path(profile_root: Path) -> Path: def _parse_local_config_raw(raw: bytes) -> dict: - """Parse localConfig_v2 value from Chromium LevelDB (strip optional prefix byte).""" + """Parse localConfig_v2 payload (strip optional prefix byte).""" if not raw: raise ValueError("localConfig_v2 is empty") if raw[0:1] in (b"\x00", b"\x01"): @@ -88,14 +85,13 @@ def _parse_local_config_raw(raw: bytes) -> dict: def _read_leveldb_value(leveldb_dir: Path, key: bytes) -> bytes | None: - """Read a single key from LevelDB; copy to temp dir if the database is locked.""" + """Read a single key from local storage; copy to temp dir if locked.""" try: import plyvel except ImportError: logger.warning( - "plyvel is not installed; cannot read Chrome LevelDB at %s. " - "Install libleveldb-dev and plyvel (Linux/macOS), or use WSL/Docker for " - "extract_slack_tokens.", + "plyvel is not installed; cannot read session storage at %s. " + "See .env.example for supported environments.", leveldb_dir, ) return None @@ -133,7 +129,7 @@ def _read_leveldb_value(leveldb_dir: Path, key: bytes) -> bytes | None: def _read_local_config_v2(profile_root: Path) -> dict | None: - """Load and parse localConfig_v2 from the Chrome profile LevelDB.""" + """Load and parse localConfig_v2 from session storage.""" leveldb_dir = _leveldb_path(profile_root) if not leveldb_dir.is_dir(): logger.warning("LevelDB not found at %s", leveldb_dir) @@ -153,18 +149,14 @@ def _read_local_config_v2(profile_root: Path) -> dict | None: def _chrome_linux_v10_cookie_key() -> bytes: - """AES key for Chromium v10 cookies on Linux (slack-chromium / headless Chrome).""" + """AES key for Chromium v10 encrypted session values on Linux.""" from Cryptodome.Protocol.KDF import PBKDF2 return PBKDF2(b"peanuts", b"saltysalt", dkLen=16, count=1) def _decrypt_chrome_linux_v10_cookie(encrypted_value: bytes) -> str: - """ - Decrypt Chromium v10 cookie blobs written by Linux Chrome (AES-128-CBC). - - Profiles from slack-chromium use this format; browser_cookie3 often fails there. - """ + """Decrypt Chromium v10 encrypted session blobs (AES-128-CBC).""" if not encrypted_value.startswith(b"v10"): raise ValueError("unsupported Chrome cookie encryption (expected v10 prefix)") from Cryptodome.Cipher import AES @@ -178,7 +170,7 @@ def _decrypt_chrome_linux_v10_cookie(encrypted_value: bytes) -> str: def _read_xoxd_cookie_from_sqlite(cookies_file: Path) -> str | None: - """Read Slack cookie 'd' via SQLite + Linux v10 decryption (slack-chromium profiles).""" + """Read session value from SQLite storage with Linux v10 decryption.""" import sqlite3 conn = sqlite3.connect(f"file:{cookies_file}?mode=ro", uri=True) @@ -205,10 +197,10 @@ def _read_xoxd_cookie_from_sqlite(cookies_file: Path) -> str | None: def _read_xoxd_cookie(profile_root: Path) -> str | None: - """Read Slack session cookie 'd' from the Chrome profile.""" + """Read secondary session credential from configured storage.""" cookies_file = _cookies_path(profile_root) if not cookies_file.is_file(): - logger.warning("Cookies database not found at %s", cookies_file) + logger.warning("Session storage database not found at %s", cookies_file) return None try: import browser_cookie3 @@ -228,22 +220,17 @@ def _read_xoxd_cookie(profile_root: Path) -> str | None: if value: return value except Exception as e: - logger.warning("Error reading cookie 'd' from SQLite: %s", e) + logger.warning("Error reading session credential from SQLite: %s", e) return None - logger.warning("xoxd token (cookie 'd') not found in %s", cookies_file) + logger.warning("Secondary session credential not found in %s", cookies_file) return None def extract_slack_tokens_from_config( local_config: dict, xoxd: str, team_id: str ) -> dict | None: - """ - Extract xoxc and xoxd tokens from parsed localConfig and cookie value. - - Returns: - dict with xoxc, xoxd, team_id, team_name, user_id or None - """ + """Build session credential dict from parsed localConfig, or None.""" try: teams = local_config.get("teams", {}) team_data = teams.get(team_id) @@ -258,10 +245,10 @@ def extract_slack_tokens_from_config( team_name = team_data.get("name") user_id = team_data.get("user_id") if not xoxc_token: - logger.warning("xoxc token not found in team data") + logger.warning("Primary session credential not found in team data") return None if not xoxd: - logger.warning("xoxd token (cookie 'd') not found") + logger.warning("Secondary session credential not found") return None tokens = { "xoxc": xoxc_token, @@ -270,10 +257,10 @@ def extract_slack_tokens_from_config( "team_name": team_name, "user_id": user_id, } - logger.debug("Tokens extracted for team %s", team_name) + logger.debug("Session credentials loaded for team %s", team_name) return tokens except Exception as e: - logger.warning("Error extracting tokens: %s", e) + logger.warning("Error loading session credentials: %s", e) return None @@ -288,7 +275,7 @@ def get_all_team_ids_from_config(local_config: dict) -> list[str]: def get_all_team_ids(local_config: dict | None = None) -> list[str]: - """Get team IDs from localConfig; reads profile if local_config not provided.""" + """Get team IDs from localConfig; reads workspace storage if not provided.""" if local_config is not None: return get_all_team_ids_from_config(local_config) try: @@ -303,7 +290,7 @@ def get_all_team_ids(local_config: dict | None = None) -> list[str]: def is_slack_internal_token_auth_error(error: str | None) -> bool: - """True if Slack API error indicates expired or invalid xoxc/xoxd session.""" + """True if Slack API error indicates expired or invalid session credentials.""" return (error or "").strip() in SLACK_INTERNAL_TOKEN_AUTH_ERRORS @@ -312,11 +299,7 @@ def probe_slack_internal_tokens( xoxd: str, file_id: str = SLACK_TOKEN_PROBE_FILE_ID, ) -> bool: - """ - Return True if xoxc/xoxd authenticate against Slack files.info. - - Uses a dummy file id: file_not_found and other non-auth errors still mean tokens work. - """ + """Return True if session credentials authenticate against Slack files.info.""" xoxc = (xoxc or "").strip() xoxd = (xoxd or "").strip() if not xoxc or not xoxd: @@ -345,12 +328,8 @@ def probe_slack_internal_tokens( def extract_slack_tokens_auto(team_id: str) -> dict | None: - """ - Read xoxc/xoxd from CHROME_PROFILE_PATH (logged-in Slack session on disk). - - Stop slack-chromium (slack-session profile) before calling to avoid LevelDB locks. - """ - logger.debug("Starting Slack token extraction for team %s", team_id) + """Load session credentials for team_id from configured workspace paths.""" + logger.debug("Loading Slack session credentials for team %s", team_id) try: profile_root = _resolve_chrome_profile_root() except ValueError as e: @@ -358,15 +337,14 @@ def extract_slack_tokens_auto(team_id: str) -> dict | None: return None if not profile_root.is_dir(): logger.error( - "Chrome profile not found at %s. Log in via slack-session (noVNC) or run " - "manage.py extract_slack_tokens after login.", + "Session storage not found at %s. See .env.example.", profile_root, ) return None local_config = _read_local_config_v2(profile_root) if not local_config: logger.error( - "Failed to read localConfig_v2 from profile. Ensure Slack is logged in at app.slack.com." + "Failed to read session configuration from workspace storage. See .env.example." ) return None team_ids = get_all_team_ids_from_config(local_config) @@ -375,12 +353,12 @@ def extract_slack_tokens_auto(team_id: str) -> dict | None: xoxd = _read_xoxd_cookie(profile_root) if not xoxd: logger.error( - "Failed to read cookie 'd'. Profile may be from a different OS or browser still running." + "Failed to read secondary session credential from workspace storage." ) return None - logger.debug("Extracting tokens for team ID: %s", team_id) + logger.debug("Loading session credentials for team ID: %s", team_id) tokens = extract_slack_tokens_from_config(local_config, xoxd, team_id) if tokens: return tokens - logger.warning("Failed to extract tokens for team %s", team_id) + logger.warning("Failed to load session credentials for team %s", team_id) return None diff --git a/slack_event_handler/workspace.py b/slack_event_handler/workspace.py index 91b0b94c..fc5edb3c 100644 --- a/slack_event_handler/workspace.py +++ b/slack_event_handler/workspace.py @@ -3,8 +3,8 @@ Layout: workspace/slack_event_handler/ - data/ (state.json, raw event files) - - chrome_profile/ (Slack login session for xoxc/xoxd extraction) - - slack_internal_tokens.json (xoxc/xoxd tokens, not .env) + - chrome_profile/ (session storage for huddle credentials) + - slack_internal_tokens.json (session credentials, not .env) """ import os @@ -30,14 +30,14 @@ def get_data_dir() -> Path: def get_chrome_profile_path() -> Path: - """Chrome user-data dir for Slack session extraction.""" + """Session storage directory for Slack huddle credentials.""" path = get_workspace_root() / CHROME_PROFILE_DIRNAME path.mkdir(parents=True, exist_ok=True) return path def get_slack_internal_tokens_json_path() -> Path: - """JSON file storing xoxc/xoxd per team.""" + """JSON file storing session credentials per team.""" return get_workspace_root() / SLACK_INTERNAL_TOKENS_FILENAME