From d4efdf2ca0bc1d66dec3f421bb3270e443e674b4 Mon Sep 17 00:00:00 2001 From: Vlad Frangu Date: Mon, 25 May 2026 13:26:53 +0300 Subject: [PATCH 1/3] feat: disallow scraping and force canonical on preview builds Adds a postBuild plugin that, for PR/preview deployments, overwrites robots.txt with a global Disallow and rewrites every page's canonical link to point at docs.apify.com so search engines don't index previews or compete with production URLs. --- docusaurus.config.js | 1 + .../docusaurus-plugin-preview-meta/index.js | 70 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 src/plugins/docusaurus-plugin-preview-meta/index.js diff --git a/docusaurus.config.js b/docusaurus.config.js index 567a221477..d6be4d3ace 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -255,6 +255,7 @@ module.exports = { allowedInDev: false, }, ], + resolve(__dirname, 'src/plugins/docusaurus-plugin-preview-meta'), () => ({ name: 'webpack-loader-fix', configureWebpack() { diff --git a/src/plugins/docusaurus-plugin-preview-meta/index.js b/src/plugins/docusaurus-plugin-preview-meta/index.js new file mode 100644 index 0000000000..b92e042cbb --- /dev/null +++ b/src/plugins/docusaurus-plugin-preview-meta/index.js @@ -0,0 +1,70 @@ +const fs = require('node:fs/promises'); +const path = require('node:path'); + +const CANONICAL_ORIGIN = 'https://docs.apify.com'; + +function isPreviewBuild() { + const url = process.env.APIFY_DOCS_ABSOLUTE_URL; + if (!url) return false; + try { + const { hostname } = new URL(url); + return hostname.includes('pr-') || hostname.includes('preview'); + } catch { + return false; + } +} + +async function walkHtmlFiles(dir) { + const entries = await fs.readdir(dir, { withFileTypes: true }); + const results = await Promise.all( + entries.map(async (entry) => { + const fullPath = path.join(dir, entry.name); + if (entry.isDirectory()) return walkHtmlFiles(fullPath); + if (entry.isFile() && entry.name.endsWith('.html')) return [fullPath]; + 
return []; + }), + ); + return results.flat(); +} + +function canonicalUrlForFile(outDir, file) { + const urlPath = path + .relative(outDir, file) + .replace(/\\/g, '/') + .replace(/\/?index\.html$/, '') + .replace(/\.html$/, ''); + return urlPath ? `${CANONICAL_ORIGIN}/${urlPath}` : CANONICAL_ORIGIN; +} + +const CANONICAL_TAG_REGEX = /]+rel=["']canonical["'][^>]*\/?>/i; + +module.exports = function previewMetaPlugin() { + return { + name: 'docusaurus-plugin-preview-meta', + async postBuild({ outDir }) { + if (!isPreviewBuild()) return; + + await fs.writeFile(path.join(outDir, 'robots.txt'), 'User-agent: *\nDisallow: /\n'); + + const htmlFiles = await walkHtmlFiles(outDir); + await Promise.all( + htmlFiles.map(async (file) => { + const content = await fs.readFile(file, 'utf8'); + const canonicalUrl = canonicalUrlForFile(outDir, file); + const canonicalTag = ``; + + let next; + if (CANONICAL_TAG_REGEX.test(content)) { + next = content.replace(CANONICAL_TAG_REGEX, canonicalTag); + } else if (content.includes('')) { + next = content.replace('', `${canonicalTag}`); + } else { + return; + } + + if (next !== content) await fs.writeFile(file, next); + }), + ); + }, + }; +}; From 49d63e9cede266a3bbe55133b6992ebc16ff10c4 Mon Sep 17 00:00:00 2001 From: Vlad Frangu Date: Mon, 25 May 2026 13:32:17 +0300 Subject: [PATCH 2/3] fix: handle SWC-minified HTML when rewriting preview canonicals The original regex required quotes around `canonical` and the fallback inserted before `</head>`. Both forms are stripped by the SWC HTML minimizer, so neither branch fired against real build output and the canonical rewrite shipped as dead code. Make the regex quote-optional, add a `]+rel=["']canonical["'][^>]*\/?>/i; +// SWC's HTML minifier strips quotes from attributes and omits </head>, so the regex must be +// quote-optional and the fallback must target <body>. 
+const CANONICAL_TAG_REGEX = /]*?\brel=["']?canonical["']?[^>]*>/i; module.exports = function previewMetaPlugin() { return { @@ -58,6 +59,8 @@ module.exports = function previewMetaPlugin() { next = content.replace(CANONICAL_TAG_REGEX, canonicalTag); } else if (content.includes('')) { next = content.replace('', `${canonicalTag}`); + } else if (content.includes(' Date: Mon, 25 May 2026 20:40:34 +0300 Subject: [PATCH 3/3] fix(nginx): noindex preview hostnames at the edge (#2569) --- nginx.conf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nginx.conf b/nginx.conf index 1b8bb1b244..4d40e8cad5 100644 --- a/nginx.conf +++ b/nginx.conf @@ -810,6 +810,12 @@ server { resolver 172.20.0.10; server_name ~^(?[^.]+)\.preview\.docs\.apify\.com$; + # Block search indexing on every response from preview hostnames, including the + # apify.github.io proxy_passes (SDK/Client/CLI) that have no preview bucket and + # would otherwise serve production content under the preview hostname. + # `always` keeps the header on 4xx/5xx so error pages aren't indexed either. + add_header X-Robots-Tag "noindex, nofollow, noarchive" always; + # add trailing slashes to the root of GH pages docs rewrite ^/api/client/js$ /api/client/js/ redirect; rewrite ^/api/client/python$ /api/client/python/ redirect;