From fd3d41318bb718f57690bf73f081ab8139bb3282 Mon Sep 17 00:00:00 2001 From: sebastienlorber Date: Wed, 17 Aug 2022 14:37:55 +0200 Subject: [PATCH 1/2] prepare doc before sitemap bugfix --- website/docs/seo.md | 20 +++++++++++++++++++- website/src/pages/examples/noIndex.md | 25 +++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 website/src/pages/examples/noIndex.md diff --git a/website/docs/seo.md b/website/docs/seo.md index 578bb7761c30..f252889cdf54 100644 --- a/website/docs/seo.md +++ b/website/docs/seo.md @@ -124,7 +124,11 @@ Read more about the robots file in [the Google documentation](https://developers :::caution -**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed. Use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata) to prevent it from appearing in search results entirely. +**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed. + +To prevent your whole Docusaurus site from being indexed, use the [`noIndex`](./api/docusaurus.config.js.md#noIndex) site config. Some [hosting providers](./deployment.mdx) may also let you configure an `X-Robots-Tag: noindex` HTTP header (GitHub Pages does not support this). + +To prevent a single page from being indexed, use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata). Read more about the [robots meta tag](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag). ::: @@ -132,6 +136,20 @@ Read more about the robots file in [the Google documentation](https://developers Docusaurus provides the [`@docusaurus/plugin-sitemap`](./api/plugins/plugin-sitemap.md) plugin, which is shipped with `preset-classic` by default. It autogenerates a `sitemap.xml` file which will be available at `https://example.com/[baseUrl]/sitemap.xml` after the production build. This sitemap metadata helps search engine crawlers crawl your site more accurately. 
+:::tip + +The sitemap plugin automatically filters pages containing a `noindex` [robots meta directive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag). + +For example, [`/examples/noIndex`](/examples/noIndex) is not included in the [Docusaurus sitemap.xml file](pathname:///sitemap.xml) because it contains the following [page metadata](#single-page-metadata): + +```html +<head> + <meta name="robots" content="noindex, nofollow" /> +</head> +``` + +::: + ## Human readable links {#human-readable-links} Docusaurus uses your file names as links, but you can always change that using slugs, see this [tutorial](./guides/docs/docs-introduction.md#document-id) for more details. diff --git a/website/src/pages/examples/noIndex.md b/website/src/pages/examples/noIndex.md new file mode 100644 index 000000000000..b4df30e2f883 --- /dev/null +++ b/website/src/pages/examples/noIndex.md @@ -0,0 +1,25 @@ +# No Index Page example + +<head> + <meta name="robots" content="noindex, nofollow" /> +</head> + +This page will not be indexed by search engines because it contains the following [page metadata](/docs/seo#single-page-metadata) markup: + +```html +<head> + <meta name="robots" content="noindex, nofollow" /> +</head> +``` + +:::tip + +The sitemap plugin filters pages containing a `noindex` content value. This page doesn't appear in the Docusaurus [sitemap.xml](pathname:///sitemap.xml) file. + +::: + +:::note + +Robots directives are [case-insensitive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag#directives). 
+ +::: From d9e41c682f674b552af8a97d2831e748be9b5788 Mon Sep 17 00:00:00 2001 From: sebastienlorber Date: Wed, 17 Aug 2022 15:02:03 +0200 Subject: [PATCH 2/2] fix sitemap noindex bug --- .../src/__tests__/createSitemap.test.ts | 5 +- .../src/createSitemap.ts | 53 +++++++++++++++---- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts b/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts index 9d42f781baaa..8a4fa2ac553d 100644 --- a/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts +++ b/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts @@ -158,7 +158,10 @@ describe('createSitemap', () => { meta: { // @ts-expect-error: bad lib def toComponent: () => [ - React.createElement('meta', {name: 'robots', content: 'noindex'}), + React.createElement('meta', { + name: 'robots', + content: 'NoFolloW, NoiNDeX', + }), ], }, }, diff --git a/packages/docusaurus-plugin-sitemap/src/createSitemap.ts b/packages/docusaurus-plugin-sitemap/src/createSitemap.ts index 536f2ecfc6bd..2c9166739733 100644 --- a/packages/docusaurus-plugin-sitemap/src/createSitemap.ts +++ b/packages/docusaurus-plugin-sitemap/src/createSitemap.ts @@ -13,6 +13,40 @@ import type {DocusaurusConfig} from '@docusaurus/types'; import type {HelmetServerState} from 'react-helmet-async'; import type {PluginOptions} from './options'; +function isNoIndexMetaRoute({ + head, + route, +}: { + head: {[location: string]: HelmetServerState}; + route: string; +}) { + const isNoIndexMetaTag = ({ + name, + content, + }: { + name?: string; + content?: string; + }): boolean => { + if (!name || !content) { + return false; + } + return ( + // meta name is not case-sensitive + name.toLowerCase() === 'robots' && + // Robots directives are not case-sensitive + content.toLowerCase().includes('noindex') + ); + }; + + // https://github.com/staylor/react-helmet-async/pull/167 + const meta = 
head[route]?.meta.toComponent() as unknown as + | ReactElement<{name?: string; content?: string}>[] + | undefined; + return meta?.some((tag) => + isNoIndexMetaTag({name: tag.props.name, content: tag.props.content}), + ); +} + export default async function createSitemap( siteConfig: DocusaurusConfig, routesPaths: string[], @@ -27,18 +61,15 @@ export default async function createSitemap( const ignoreMatcher = createMatcher(ignorePatterns); - const includedRoutes = routesPaths.filter((route) => { - if (route.endsWith('404.html') || ignoreMatcher(route)) { - return false; - } - // https://github.com/staylor/react-helmet-async/pull/167 - const meta = head[route]?.meta.toComponent() as unknown as - | ReactElement<{name?: string; content?: string}>[] - | undefined; - return !meta?.some( - (tag) => tag.props.name === 'robots' && tag.props.content === 'noindex', + function isRouteExcluded(route: string) { + return ( + route.endsWith('404.html') || + ignoreMatcher(route) || + isNoIndexMetaRoute({head, route}) ); - }); + } + + const includedRoutes = routesPaths.filter((route) => !isRouteExcluded(route)); if (includedRoutes.length === 0) { return null;