about | summary | refs | log | tree | commit | diff
path: root/packages/integrations/sitemap/src
diff options
context:
space:
mode:
Diffstat (limited to 'packages/integrations/sitemap/src')
-rw-r--r-- packages/integrations/sitemap/src/config-defaults.ts | 6
-rw-r--r-- packages/integrations/sitemap/src/generate-sitemap.ts | 77
-rw-r--r-- packages/integrations/sitemap/src/index.ts | 195
-rw-r--r-- packages/integrations/sitemap/src/schema.ts | 41
-rw-r--r-- packages/integrations/sitemap/src/utils/parse-i18n-url.ts | 42
-rw-r--r-- packages/integrations/sitemap/src/validate-options.ts | 22
-rw-r--r-- packages/integrations/sitemap/src/write-sitemap.ts | 75
7 files changed, 458 insertions, 0 deletions
diff --git a/packages/integrations/sitemap/src/config-defaults.ts b/packages/integrations/sitemap/src/config-defaults.ts
new file mode 100644
index 000000000..8d854c7a9
--- /dev/null
+++ b/packages/integrations/sitemap/src/config-defaults.ts
@@ -0,0 +1,6 @@
+import type { SitemapOptions } from './index.js';
+
+/**
+ * Fallback values used for the integration options that have a default.
+ */
+export const SITEMAP_CONFIG_DEFAULTS = {
+  // Default number of entries per sitemap file
+  entryLimit: 45000,
+  // Default prefix of the generated file names
+  filenameBase: 'sitemap',
+} satisfies SitemapOptions;
diff --git a/packages/integrations/sitemap/src/generate-sitemap.ts b/packages/integrations/sitemap/src/generate-sitemap.ts
new file mode 100644
index 000000000..0fb096cc9
--- /dev/null
+++ b/packages/integrations/sitemap/src/generate-sitemap.ts
@@ -0,0 +1,77 @@
+import type { EnumChangefreq } from 'sitemap';
+import type { SitemapItem, SitemapOptions } from './index.js';
+import { parseI18nUrl } from './utils/parse-i18n-url.js';
+
+/**
+ * Build the list of sitemap entries for a set of page URLs.
+ *
+ * The URLs are copied and sorted so the output is the same on every run. Every entry
+ * receives the shared `lastmod`, `priority` and `changefreq` values from `opts`, plus
+ * alternate-language links when `opts.i18n` supplies both `defaultLocale` and `locales`.
+ *
+ * @param pages Page URLs; the array itself is not modified.
+ * @param finalSiteUrl Site URL that the i18n URL parser strips from each page URL.
+ * @param opts Integration options.
+ * @returns One sitemap item per URL, in sorted order.
+ */
+export function generateSitemap(pages: string[], finalSiteUrl: string, opts?: SitemapOptions) {
+  // TODO: find way to respect <link rel="canonical"> URLs here
+  const sortedUrls = Array.from(pages);
+  // Numeric-aware alphabetical order keeps the sitemap identical between builds
+  sortedUrls.sort((left, right) => left.localeCompare(right, 'en', { numeric: true }));
+
+  const lastmod = opts?.lastmod?.toISOString();
+  const priority = opts?.priority;
+  const changefreq = opts?.changefreq as EnumChangefreq;
+
+  // Alternate links are only produced when both i18n settings are present
+  const defaultLocale = opts?.i18n?.defaultLocale;
+  const locales = opts?.i18n?.locales;
+  const getI18nLinks: GetI18nLinks | undefined =
+    defaultLocale && locales
+      ? createGetI18nLinks(sortedUrls, defaultLocale, locales, finalSiteUrl)
+      : undefined;
+
+  const urlData: SitemapItem[] = [];
+  sortedUrls.forEach((url, index) => {
+    urlData.push({
+      url,
+      links: getI18nLinks ? getI18nLinks(index) : undefined,
+      lastmod,
+      priority,
+      changefreq,
+    });
+  });
+
+  return urlData;
+}
+
+/** Returns the alternate-language links for the URL at `urlIndex`, if it has any. */
+type GetI18nLinks = (urlIndex: number) => SitemapItem['links'] | undefined;
+
+/**
+ * Prepare a lookup that returns, for a URL index, the links to every URL sharing the
+ * same locale-independent path (e.g. `/en/foo` and `/es/foo`).
+ *
+ * All URLs are parsed and grouped by path in a single pass up front, so each lookup is
+ * a map access. Previously every URL without a translated sibling triggered a full
+ * rescan of the list, which is quadratic on sites where most pages are not translated.
+ *
+ * @param urls Page URLs; the returned function is indexed against this array.
+ * @param defaultLocale Locale assigned to URLs that carry no locale segment.
+ * @param locales Map from URL locale segment to the language code emitted in the link.
+ * @param finalSiteUrl Site URL stripped from each URL before the locale is read.
+ * @returns A function giving the shared links array for a URL index, or `undefined`
+ *   when the URL could not be parsed or no other URL shares its path.
+ */
+function createGetI18nLinks(
+  urls: string[],
+  defaultLocale: string,
+  locales: Record<string, string>,
+  finalSiteUrl: string,
+): GetI18nLinks {
+  // `parsedI18nUrls` will have the same length as `urls`, matching correspondingly
+  const parsedI18nUrls = urls.map((url) => parseI18nUrl(url, defaultLocale, locales, finalSiteUrl));
+
+  // Group the links by locale-independent path; URLs with the same path share one array
+  const linksByPath = new Map<string, NonNullable<SitemapItem['links']>>();
+  parsedI18nUrls.forEach((parsed, i) => {
+    if (!parsed) {
+      return;
+    }
+    let links = linksByPath.get(parsed.path);
+    if (!links) {
+      links = [];
+      linksByPath.set(parsed.path, links);
+    }
+    links.push({
+      url: urls[i],
+      lang: locales[parsed.locale],
+    });
+  });
+
+  return (urlIndex) => {
+    const i18nUrl = parsedI18nUrls[urlIndex];
+    if (!i18nUrl) {
+      return undefined;
+    }
+
+    // A group of 0 or 1 (which is the URL itself) has nothing to link to
+    const links = linksByPath.get(i18nUrl.path);
+    return links && links.length > 1 ? links : undefined;
+  };
+}
diff --git a/packages/integrations/sitemap/src/index.ts b/packages/integrations/sitemap/src/index.ts
new file mode 100644
index 000000000..078f78abb
--- /dev/null
+++ b/packages/integrations/sitemap/src/index.ts
@@ -0,0 +1,195 @@
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import type { AstroConfig, AstroIntegration } from 'astro';
+import type { EnumChangefreq, LinkItem as LinkItemBase, SitemapItemLoose } from 'sitemap';
+import { ZodError } from 'zod';
+
+import { generateSitemap } from './generate-sitemap.js';
+import { validateOptions } from './validate-options.js';
+import { writeSitemap } from './write-sitemap.js';
+
+export { EnumChangefreq as ChangeFreqEnum } from 'sitemap';
+// String-literal union of the values of the `EnumChangefreq` enum from `sitemap`
+export type ChangeFreq = `${EnumChangefreq}`;
+// The subset of the `sitemap` package's loose item fields that this integration fills in
+export type SitemapItem = Pick<
+ SitemapItemLoose,
+ 'url' | 'lastmod' | 'changefreq' | 'priority' | 'links'
+>;
+// Alias of the `sitemap` package's `LinkItem` type
+export type LinkItem = LinkItemBase;
+
+// Options accepted by the integration; `undefined` is part of the union
+export type SitemapOptions =
+ | {
+ // prefix of the generated files: `<filenameBase>-index.xml` and `<filenameBase>-<n>.xml`
+ filenameBase?: string;
+ // called with each collected page URL; URLs for which it returns false are left out
+ filter?(page: string): boolean;
+ // extra URLs merged with the discovered ones before `filter` is applied
+ customPages?: string[];
+
+ // when both fields are set, URLs that share a path across locales are linked to each other;
+ // `locales` maps the locale segment of a URL to the language code used for its link
+ i18n?: {
+ defaultLocale: string;
+ locales: Record<string, string>;
+ };
+ // number of entries per sitemap file
+ entryLimit?: number;
+
+ // sitemap specific: the same values are applied to every generated entry
+ changefreq?: ChangeFreq;
+ lastmod?: Date;
+ priority?: number;
+
+ // called for each sitemap item just before it is written to disk, sync or async;
+ // items for which it returns a falsy value are dropped
+ serialize?(item: SitemapItem): SitemapItem | Promise<SitemapItem | undefined> | undefined;
+
+ // resolved against the final site URL and passed to the sitemap streams as `xslUrl`
+ xslURL?: string;
+ }
+ | undefined;
+
+/**
+ * Render the issues of a Zod validation error as text, one line per issue.
+ *
+ * Each line holds the dotted path of the offending option followed by the issue
+ * message with a trailing period.
+ */
+function formatConfigErrorMessage(err: ZodError) {
+  const lines: string[] = [];
+  for (const { path: issuePath, message } of err.issues) {
+    lines.push(` ${issuePath.join('.')} ${message}.`);
+  }
+  return lines.join('\n');
+}
+
+const PKG_NAME = '@astrojs/sitemap';
+// Page names that are treated as status-code pages
+const STATUS_CODE_PAGES = new Set(['404', '500']);
+
+/**
+ * Build a predicate that recognises status-code pages.
+ *
+ * A pathname matches when, after dropping one trailing and then one leading slash, it
+ * is a status page name on its own or prefixed with one of the given locales
+ * (`<locale>/<page>`).
+ *
+ * @param locales Locale prefixes under which status pages may also live.
+ * @returns A function telling whether a pathname is a status-code page.
+ */
+const isStatusCodePage = (locales: string[]) => {
+  const statusPathNames = new Set<string>();
+  for (const locale of locales) {
+    for (const page of STATUS_CODE_PAGES) {
+      statusPathNames.add(`${locale}/${page}`);
+    }
+  }
+  for (const page of STATUS_CODE_PAGES) {
+    statusPathNames.add(page);
+  }
+
+  return (pathname: string): boolean => {
+    const withoutTrailingSlash = pathname.endsWith('/') ? pathname.slice(0, -1) : pathname;
+    const bare = withoutTrailingSlash.startsWith('/')
+      ? withoutTrailingSlash.slice(1)
+      : withoutTrailingSlash;
+    return statusPathNames.has(bare);
+  };
+};
+/**
+ * Create the sitemap integration.
+ *
+ * The Astro config is captured in `astro:config:done`. In `astro:build:done` the
+ * integration collects the URLs of the built pages and page routes, merges in
+ * `customPages`, applies `filter` and `serialize`, and writes the sitemap files into
+ * the build output directory. Nothing is written when `site` is not configured, when
+ * no URL remains, or when `serialize` throws. Option validation errors are logged as a
+ * warning; any other error is rethrown.
+ *
+ * @param options Integration options; validated when the build finishes.
+ */
+const createPlugin = (options?: SitemapOptions): AstroIntegration => {
+  let config: AstroConfig;
+
+  return {
+    name: PKG_NAME,
+
+    hooks: {
+      'astro:config:done': async ({ config: cfg }) => {
+        config = cfg;
+      },
+
+      'astro:build:done': async ({ dir, routes, pages, logger }) => {
+        try {
+          if (!config.site) {
+            logger.warn(
+              'The Sitemap integration requires the `site` astro.config option. Skipping.',
+            );
+            return;
+          }
+
+          const opts = validateOptions(config.site, options);
+
+          const { filenameBase, filter, customPages, serialize, entryLimit } = opts;
+
+          const outFile = `${filenameBase}-index.xml`;
+          const finalSiteUrl = new URL(config.base, config.site);
+          const shouldIgnoreStatus = isStatusCodePage(Object.keys(opts.i18n?.locales ?? {}));
+          let pageUrls = pages
+            .filter((p) => !shouldIgnoreStatus(p.pathname))
+            .map((p) => {
+              // The first non-empty pathname adds a trailing slash to `finalSiteUrl` itself;
+              // the updated URL is also what the rest of this hook uses.
+              if (p.pathname !== '' && !finalSiteUrl.pathname.endsWith('/'))
+                finalSiteUrl.pathname += '/';
+              // Drop the leading slash on a local copy: the `pages` entries are supplied by
+              // the hook caller and must not be modified here.
+              const pagePath = p.pathname.startsWith('/') ? p.pathname.slice(1) : p.pathname;
+              const fullPath = finalSiteUrl.pathname + pagePath;
+              return new URL(fullPath, finalSiteUrl).href;
+            });
+
+          const routeUrls = routes.reduce<string[]>((urls, r) => {
+            // Only expose pages, not endpoints or redirects
+            if (r.type !== 'page') return urls;
+
+            // Dynamic URLs have entries with `undefined` pathnames
+            if (!r.pathname) return urls;
+
+            if (shouldIgnoreStatus(r.pathname)) return urls;
+
+            // `finalSiteUrl` may end with a trailing slash
+            // or not because of base paths.
+            let fullPath = finalSiteUrl.pathname;
+            if (fullPath.endsWith('/')) fullPath += r.generate(r.pathname).substring(1);
+            else fullPath += r.generate(r.pathname);
+
+            const newUrl = new URL(fullPath, finalSiteUrl).href;
+
+            // A trailing slash is only appended for directory builds that allow one
+            const needsTrailingSlash =
+              config.trailingSlash !== 'never' &&
+              config.build.format === 'directory' &&
+              !newUrl.endsWith('/');
+            urls.push(needsTrailingSlash ? newUrl + '/' : newUrl);
+
+            return urls;
+          }, []);
+
+          // De-duplicate while keeping first-seen order
+          pageUrls = Array.from(new Set([...pageUrls, ...routeUrls, ...(customPages ?? [])]));
+
+          if (filter) {
+            pageUrls = pageUrls.filter(filter);
+          }
+
+          if (pageUrls.length === 0) {
+            logger.warn(`No pages found!\n\`${outFile}\` not created.`);
+            return;
+          }
+
+          let urlData = generateSitemap(pageUrls, finalSiteUrl.href, opts);
+
+          if (serialize) {
+            try {
+              const serializedUrls: SitemapItem[] = [];
+              for (const item of urlData) {
+                // `serialize` may be sync or async; a falsy result drops the item
+                const serialized = await serialize(item);
+                if (serialized) {
+                  serializedUrls.push(serialized);
+                }
+              }
+              if (serializedUrls.length === 0) {
+                logger.warn('No pages found!');
+                return;
+              }
+              urlData = serializedUrls;
+            } catch (err) {
+              logger.error(`Error serializing pages\n${String(err)}`);
+              return;
+            }
+          }
+          const destDir = fileURLToPath(dir);
+          const xslURL = opts.xslURL ? new URL(opts.xslURL, finalSiteUrl).href : undefined;
+          await writeSitemap(
+            {
+              filenameBase: filenameBase,
+              hostname: finalSiteUrl.href,
+              destinationDir: destDir,
+              publicBasePath: config.base,
+              sourceData: urlData,
+              limit: entryLimit,
+              xslURL: xslURL,
+            },
+            config,
+          );
+          logger.info(`\`${outFile}\` created at \`${path.relative(process.cwd(), destDir)}\``);
+        } catch (err) {
+          // Invalid options are reported as a warning; anything else is unexpected
+          if (err instanceof ZodError) {
+            logger.warn(formatConfigErrorMessage(err));
+          } else {
+            throw err;
+          }
+        }
+      },
+    },
+  };
+};
+
+export default createPlugin;
diff --git a/packages/integrations/sitemap/src/schema.ts b/packages/integrations/sitemap/src/schema.ts
new file mode 100644
index 000000000..0ab9d672d
--- /dev/null
+++ b/packages/integrations/sitemap/src/schema.ts
@@ -0,0 +1,41 @@
+import { EnumChangefreq as ChangeFreq } from 'sitemap';
+import { z } from 'zod';
+import { SITEMAP_CONFIG_DEFAULTS } from './config-defaults.js';
+
+// A locale key (the locale segment used in URLs) must be a non-empty string
+const localeKeySchema = z.string().min(1);
+
+/**
+ * Zod schema for the integration options.
+ *
+ * Unknown keys are rejected (`strict`), `filenameBase` and `entryLimit` fall back to
+ * `SITEMAP_CONFIG_DEFAULTS`, and omitting the options entirely yields those defaults.
+ */
+export const SitemapOptionsSchema = z
+  .object({
+    filenameBase: z.string().optional().default(SITEMAP_CONFIG_DEFAULTS.filenameBase),
+    filter: z.function().args(z.string()).returns(z.boolean()).optional(),
+    customPages: z.string().url().array().optional(),
+    canonicalURL: z.string().url().optional(),
+    xslURL: z.string().optional(),
+
+    i18n: z
+      .object({
+        defaultLocale: localeKeySchema,
+        locales: z.record(
+          localeKeySchema,
+          z
+            .string()
+            .min(2)
+            // No `g`/`m` flags: the whole value must match. With `m` a multi-line string
+            // passed as soon as one of its lines matched, and `g` makes the regex stateful.
+            .regex(/^[a-zA-Z\-]+$/, {
+              message: 'Only English alphabet symbols and hyphen allowed',
+            }),
+        ),
+      })
+      // Own-property check, so names inherited from `Object.prototype`
+      // (e.g. `constructor`) are not accepted as an existing locale key.
+      .refine((val) => Object.prototype.hasOwnProperty.call(val.locales, val.defaultLocale), {
+        message: '`defaultLocale` must exist in `locales` keys',
+      })
+      .optional(),
+
+    entryLimit: z.number().nonnegative().optional().default(SITEMAP_CONFIG_DEFAULTS.entryLimit),
+    serialize: z.function().args(z.any()).returns(z.any()).optional(),
+
+    changefreq: z.nativeEnum(ChangeFreq).optional(),
+    lastmod: z.date().optional(),
+    priority: z.number().min(0).max(1).optional(),
+  })
+  .strict()
+  .default(SITEMAP_CONFIG_DEFAULTS);
diff --git a/packages/integrations/sitemap/src/utils/parse-i18n-url.ts b/packages/integrations/sitemap/src/utils/parse-i18n-url.ts
new file mode 100644
index 000000000..86221ca9d
--- /dev/null
+++ b/packages/integrations/sitemap/src/utils/parse-i18n-url.ts
@@ -0,0 +1,42 @@
+/** Locale and locale-independent path extracted from a page URL. */
+interface ParsedI18nUrl {
+  locale: string;
+  path: string;
+}
+
+/**
+ * Split a page URL into its locale and the path without the locale segment.
+ *
+ * The first path segment after `base` is treated as the locale when it is one of the
+ * keys of `locales`; otherwise the URL belongs to `defaultLocale` and its path is kept
+ * unchanged.
+ *
+ * NOTE: The parameters have been schema-validated with Zod
+ *
+ * @param url Page URL to parse.
+ * @param defaultLocale Locale used for URLs without a locale segment.
+ * @param locales Map whose keys are the locale segments that may appear in URLs.
+ * @param base Prefix stripped from `url` before the locale is read.
+ * @returns The parsed parts, or `undefined` when `url` does not start with `base`.
+ */
+export function parseI18nUrl(
+  url: string,
+  defaultLocale: string,
+  locales: Record<string, string>,
+  base: string,
+): ParsedI18nUrl | undefined {
+  if (!url.startsWith(base)) {
+    return undefined;
+  }
+
+  let s = url.slice(base.length);
+
+  // Handle root URL
+  if (!s || s === '/') {
+    return { locale: defaultLocale, path: '/' };
+  }
+
+  if (s[0] !== '/') {
+    s = '/' + s;
+  }
+
+  // Get locale from path, e.g.
+  // "/en-US/" -> "en-US"
+  // "/en-US/foo" -> "en-US"
+  const [, locale = ''] = s.split('/');
+  // Own-property check: the `in` operator would also accept names inherited from
+  // `Object.prototype`, turning paths such as "/constructor/foo" into a bogus locale.
+  if (Object.prototype.hasOwnProperty.call(locales, locale)) {
+    // "/en-US/foo" -> "/foo"
+    const path = s.slice(1 + locale.length) || '/';
+    return { locale, path };
+  }
+
+  return { locale: defaultLocale, path: s };
+}
diff --git a/packages/integrations/sitemap/src/validate-options.ts b/packages/integrations/sitemap/src/validate-options.ts
new file mode 100644
index 000000000..f51750ff5
--- /dev/null
+++ b/packages/integrations/sitemap/src/validate-options.ts
@@ -0,0 +1,22 @@
+import { z } from 'zod';
+import type { SitemapOptions } from './index.js';
+import { SitemapOptionsSchema } from './schema.js';
+
+/**
+ * Validate the integration options and return them with defaults applied.
+ *
+ * After the options pass the schema, a second check requires that either the Astro
+ * `site` or the `canonicalURL` option is present. Both checks throw a `ZodError` on
+ * failure.
+ *
+ * @internal
+ */
+export const validateOptions = (site: string | undefined, opts: SitemapOptions) => {
+  const parsedOptions = SitemapOptionsSchema.parse(opts);
+
+  const siteOrCanonicalSchema = z
+    .object({
+      // Astro takes care of `site`: how to validate, transform and refine
+      site: z.string().optional(),
+      // `canonicalURL` is already validated in prev step
+      canonicalURL: z.string().optional(),
+    })
+    .refine((options) => options.site || options.canonicalURL, {
+      message: 'Required `site` astro.config option or `canonicalURL` integration option',
+    });
+
+  siteOrCanonicalSchema.parse({
+    site,
+    canonicalURL: parsedOptions.canonicalURL,
+  });
+
+  return parsedOptions;
+};
diff --git a/packages/integrations/sitemap/src/write-sitemap.ts b/packages/integrations/sitemap/src/write-sitemap.ts
new file mode 100644
index 000000000..939bd91be
--- /dev/null
+++ b/packages/integrations/sitemap/src/write-sitemap.ts
@@ -0,0 +1,75 @@
+import { type WriteStream, createWriteStream } from 'node:fs';
+import { mkdir } from 'node:fs/promises';
+import { normalize, resolve } from 'node:path';
+import { Readable, pipeline } from 'node:stream';
+import { promisify } from 'node:util';
+import replace from 'stream-replace-string';
+
+import { SitemapAndIndexStream, SitemapStream } from 'sitemap';
+
+import type { AstroConfig } from 'astro';
+import type { SitemapItem } from './index.js';
+
+/** Settings for {@link writeSitemap}. */
+type WriteSitemapConfig = {
+  // Prefix of the written files: `<filenameBase>-index.xml` and `<filenameBase>-<n>.xml`
+  filenameBase: string;
+  // Hostname handed to every sitemap stream
+  hostname: string;
+  // Base used for the sitemap URLs listed in the index; defaults to `hostname`
+  sitemapHostname?: string;
+  // Entries to write
+  sourceData: SitemapItem[];
+  // Directory the files are written into; created when missing
+  destinationDir: string;
+  // Path prefix of the sitemap files inside `sitemapHostname`; defaults to './'
+  publicBasePath?: string;
+  // Passed to the index stream as `limit`; defaults to 50000
+  limit?: number;
+  // Passed to the streams as `xslUrl`
+  xslURL?: string;
+};
+
+// adapted from sitemap.js/sitemap-simple
+/**
+ * Stream the sitemap entries into `<filenameBase>-<n>.xml` files plus a
+ * `<filenameBase>-index.xml` index inside `destinationDir`.
+ *
+ * @param astroConfig Astro config; `trailingSlash` and `build.format` decide whether the
+ *   trailing slash of the site root entry is removed from the output.
+ * @returns A promise for the pipeline that feeds the entries into the index file.
+ */
+export async function writeSitemap(
+  {
+    filenameBase,
+    hostname,
+    sitemapHostname = hostname,
+    sourceData,
+    destinationDir,
+    limit = 50000,
+    publicBasePath = './',
+    xslURL: xslUrl,
+  }: WriteSitemapConfig,
+  astroConfig: AstroConfig,
+) {
+  await mkdir(destinationDir, { recursive: true });
+
+  // The public path of every sitemap file is built below this prefix
+  const basePathWithSlash = publicBasePath.endsWith('/') ? publicBasePath : publicBasePath + '/';
+
+  const sitemapAndIndexStream = new SitemapAndIndexStream({
+    limit,
+    xslUrl,
+    getSitemapStream: (i) => {
+      const sitemapStream = new SitemapStream({
+        hostname,
+        xslUrl,
+      });
+      const fileName = `./${filenameBase}-${i}.xml`;
+      const fileOnDisk = resolve(destinationDir, fileName);
+      const publicPath = normalize(basePathWithSlash + fileName);
+
+      const dropRootTrailingSlash =
+        astroConfig.trailingSlash === 'never' || astroConfig.build.format === 'file';
+
+      let stream: WriteStream;
+      if (!dropRootTrailingSlash) {
+        stream = sitemapStream.pipe(createWriteStream(fileOnDisk));
+      } else {
+        // workaround for trailing slash issue in sitemap.js: https://github.com/ekalinin/sitemap.js/issues/403
+        const host = hostname.replace(/\/$/, '');
+        stream = sitemapStream
+          .pipe(replace(`<loc>${host}/</loc>`, `<loc>${host}</loc>`))
+          .pipe(createWriteStream(fileOnDisk));
+      }
+
+      return [new URL(publicPath, sitemapHostname).toString(), sitemapStream, stream];
+    },
+  });
+
+  const runPipeline = promisify(pipeline);
+  const entries = Readable.from(sourceData);
+  const indexFile = resolve(destinationDir, `./${filenameBase}-index.xml`);
+  return runPipeline(entries, sitemapAndIndexStream, createWriteStream(indexFile));
+}