From fd3d41318bb718f57690bf73f081ab8139bb3282 Mon Sep 17 00:00:00 2001
From: sebastienlorber <lorber.sebastien@gmail.com>
Date: Wed, 17 Aug 2022 14:37:55 +0200
Subject: [PATCH 1/2] prepare doc before sitemap bugfix

---
 website/docs/seo.md                   | 20 +++++++++++++++++++-
 website/src/pages/examples/noIndex.md | 25 +++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 website/src/pages/examples/noIndex.md
diff --git a/website/docs/seo.md b/website/docs/seo.md
index 578bb7761c30..f252889cdf54 100644
--- a/website/docs/seo.md
+++ b/website/docs/seo.md
@@ -124,7 +124,11 @@ Read more about the robots file in [the Google documentation](https://developers
 
 :::caution
 
-**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed. Use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata) to prevent it from appearing in search results entirely.
+**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed.
+
+To prevent your whole Docusaurus site from being indexed, use the [`noIndex`](./api/docusaurus.config.js.md#noIndex) site config. Some [hosting providers](./deployment.mdx) may also let you configure an `X-Robots-Tag: noindex` HTTP header (GitHub Pages does not support this).
+
+To prevent a single page from being indexed, use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata). Read more about the [robots meta tag](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag).
 
 :::
 
@@ -132,6 +136,20 @@ Read more about the robots file in [the Google documentation](https://developers
 
 Docusaurus provides the [`@docusaurus/plugin-sitemap`](./api/plugins/plugin-sitemap.md) plugin, which is shipped with `preset-classic` by default. It autogenerates a `sitemap.xml` file which will be available at `https://example.com/[baseUrl]/sitemap.xml` after the production build. This sitemap metadata helps search engine crawlers crawl your site more accurately.
 
+:::tip
+
+The sitemap plugin automatically filters out pages containing a `noindex` [robots meta directive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag).
+
+For example, [`/examples/noIndex`](/examples/noIndex) is not included in the [Docusaurus sitemap.xml file](pathname:///sitemap.xml) because it contains the following [page metadata](#single-page-metadata):
+
+```html
+<head>
+  <meta name="robots" content="noindex, nofollow" />
+</head>
+```
+
+:::
+
 ## Human readable links {#human-readable-links}
 
 Docusaurus uses your file names as links, but you can always change that using slugs, see this [tutorial](./guides/docs/docs-introduction.md#document-id) for more details.
diff --git a/website/src/pages/examples/noIndex.md b/website/src/pages/examples/noIndex.md
new file mode 100644
index 000000000000..b4df30e2f883
--- /dev/null
+++ b/website/src/pages/examples/noIndex.md
@@ -0,0 +1,25 @@
+# No Index Page example
+
+<head>
+  <meta name="robots" content="nOiNdeX, NoFolLoW" />
+</head>
+
+This page will not be indexed by search engines because it contains the following [page metadata](/docs/seo#single-page-metadata) markup:
+
+```html
+<head>
+  <meta name="robots" content="noindex, nofollow" />
+</head>
+```
+
+:::tip
+
+The sitemap plugin filters out pages containing a `noindex` content value. This page doesn't appear in the Docusaurus [sitemap.xml](pathname:///sitemap.xml) file.
+
+:::
+
+:::note
+
+Robots directives are [case-insensitive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag#directives).
+
+:::

From d9e41c682f674b552af8a97d2831e748be9b5788 Mon Sep 17 00:00:00 2001
From: sebastienlorber <lorber.sebastien@gmail.com>
Date: Wed, 17 Aug 2022 15:02:03 +0200
Subject: [PATCH 2/2] fix sitemap noindex bug

---
 .../src/__tests__/createSitemap.test.ts       |  5 +-
 .../src/createSitemap.ts                      | 53 +++++++++++++++----
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts b/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts
index 9d42f781baaa..8a4fa2ac553d 100644
--- a/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts
+++ b/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts
@@ -158,7 +158,10 @@ describe('createSitemap', () => {
           meta: {
             // @ts-expect-error: bad lib def
             toComponent: () => [
-              React.createElement('meta', {name: 'robots', content: 'noindex'}),
+              React.createElement('meta', {
+                name: 'robots',
+                content: 'NoFolloW, NoiNDeX',
+              }),
             ],
           },
         },
diff --git a/packages/docusaurus-plugin-sitemap/src/createSitemap.ts b/packages/docusaurus-plugin-sitemap/src/createSitemap.ts
index 536f2ecfc6bd..2c9166739733 100644
--- a/packages/docusaurus-plugin-sitemap/src/createSitemap.ts
+++ b/packages/docusaurus-plugin-sitemap/src/createSitemap.ts
@@ -13,6 +13,40 @@ import type {DocusaurusConfig} from '@docusaurus/types';
 import type {HelmetServerState} from 'react-helmet-async';
 import type {PluginOptions} from './options';
 
+function isNoIndexMetaRoute({
+  head,
+  route,
+}: {
+  head: {[location: string]: HelmetServerState};
+  route: string;
+}) { // true when any meta tag collected for `route` is a robots tag containing "noindex"
+  const isNoIndexMetaTag = ({
+    name,
+    content,
+  }: {
+    name?: string;
+    content?: string;
+  }): boolean => {
+    if (!name || !content) { // a tag with a missing or empty name/content can never match
+      return false;
+    }
+    return (
+      // The meta "name" attribute is compared case-insensitively
+      name.toLowerCase() === 'robots' &&
+      // Robots directives are not case-sensitive; substring match also covers lists such as "nofollow, noindex"
+      content.toLowerCase().includes('noindex')
+    );
+  };
+
+  // Double cast: toComponent() is presumably mistyped by the lib, see https://github.com/staylor/react-helmet-async/pull/167 (meta is undefined if `head` has no entry for the route, and so is the result)
+  const meta = head[route]?.meta.toComponent() as unknown as
+    | ReactElement<{name?: string; content?: string}>[]
+    | undefined;
+  return meta?.some((tag) =>
+    isNoIndexMetaTag({name: tag.props.name, content: tag.props.content}),
+  );
+}
+
 export default async function createSitemap(
   siteConfig: DocusaurusConfig,
   routesPaths: string[],
@@ -27,18 +61,15 @@ export default async function createSitemap(
 
   const ignoreMatcher = createMatcher(ignorePatterns);
 
-  const includedRoutes = routesPaths.filter((route) => {
-    if (route.endsWith('404.html') || ignoreMatcher(route)) {
-      return false;
-    }
-    // https://github.com/staylor/react-helmet-async/pull/167
-    const meta = head[route]?.meta.toComponent() as unknown as
-      | ReactElement<{name?: string; content?: string}>[]
-      | undefined;
-    return !meta?.some(
-      (tag) => tag.props.name === 'robots' && tag.props.content === 'noindex',
+  function isRouteExcluded(route: string) {
+    return (
+      route.endsWith('404.html') ||
+      ignoreMatcher(route) ||
+      isNoIndexMetaRoute({head, route})
     );
-  });
+  }
+
+  const includedRoutes = routesPaths.filter((route) => !isRouteExcluded(route));
 
   if (includedRoutes.length === 0) {
     return null;