diff --git a/docusaurus.config.js b/docusaurus.config.js index 567a221477..d6be4d3ace 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -255,6 +255,7 @@ module.exports = { allowedInDev: false, }, ], + resolve(__dirname, 'src/plugins/docusaurus-plugin-preview-meta'), () => ({ name: 'webpack-loader-fix', configureWebpack() { diff --git a/nginx.conf b/nginx.conf index 1b8bb1b244..4d40e8cad5 100644 --- a/nginx.conf +++ b/nginx.conf @@ -810,6 +810,12 @@ server { resolver 172.20.0.10; server_name ~^(?[^.]+)\.preview\.docs\.apify\.com$; + # Block search indexing on every response from preview hostnames, including the + # apify.github.io proxy_passes (SDK/Client/CLI) that have no preview bucket and + # would otherwise serve production content under the preview hostname. + # `always` keeps the header on 4xx/5xx so error pages aren't indexed either. + add_header X-Robots-Tag "noindex, nofollow, noarchive" always; + # add trailing slashes to the root of GH pages docs rewrite ^/api/client/js$ /api/client/js/ redirect; rewrite ^/api/client/python$ /api/client/python/ redirect; diff --git a/src/plugins/docusaurus-plugin-preview-meta/index.js b/src/plugins/docusaurus-plugin-preview-meta/index.js new file mode 100644 index 0000000000..878f3bf3e0 --- /dev/null +++ b/src/plugins/docusaurus-plugin-preview-meta/index.js @@ -0,0 +1,73 @@ +const fs = require('node:fs/promises'); +const path = require('node:path'); + +const CANONICAL_ORIGIN = 'https://docs.apify.com'; + +function isPreviewBuild() { + const url = process.env.APIFY_DOCS_ABSOLUTE_URL; + if (!url) return false; + try { + const { hostname } = new URL(url); + return hostname.includes('pr-') || hostname.includes('preview'); + } catch { + return false; + } +} + +async function walkHtmlFiles(dir) { + const entries = await fs.readdir(dir, { withFileTypes: true }); + const results = await Promise.all( + entries.map(async (entry) => { + const fullPath = path.join(dir, entry.name); + if (entry.isDirectory()) return walkHtmlFiles(fullPath); + if (entry.isFile() && entry.name.endsWith('.html')) return [fullPath]; + return []; + }), + ); + return results.flat(); +} + +function canonicalUrlForFile(outDir, file) { + const relative = path.relative(outDir, file).replace(/\\/g, '/'); + // Docusaurus serves 404.html at the literal /404.html path; preserve it for parity with production. + if (relative === '404.html') return `${CANONICAL_ORIGIN}/404.html`; + const urlPath = relative.replace(/\/?index\.html$/, '').replace(/\.html$/, ''); + return urlPath ? `${CANONICAL_ORIGIN}/${urlPath}` : CANONICAL_ORIGIN; +} + +// SWC's HTML minifier strips quotes from attributes and omits , so the regex must be +// quote-optional and the fallback must target . +const CANONICAL_TAG_REGEX = /]*?\brel=["']?canonical["']?[^>]*>/i; + +module.exports = function previewMetaPlugin() { + return { + name: 'docusaurus-plugin-preview-meta', + async postBuild({ outDir }) { + if (!isPreviewBuild()) return; + + await fs.writeFile(path.join(outDir, 'robots.txt'), 'User-agent: *\nDisallow: /\n'); + + const htmlFiles = await walkHtmlFiles(outDir); + await Promise.all( + htmlFiles.map(async (file) => { + const content = await fs.readFile(file, 'utf8'); + const canonicalUrl = canonicalUrlForFile(outDir, file); + const canonicalTag = ``; + + let next; + if (CANONICAL_TAG_REGEX.test(content)) { + next = content.replace(CANONICAL_TAG_REGEX, canonicalTag); + } else if (content.includes('')) { + next = content.replace('', `${canonicalTag}`); + } else if (content.includes('