diff options
Diffstat (limited to 'packages/integrations/sitemap/src')
-rw-r--r-- | packages/integrations/sitemap/src/config-defaults.ts | 7 | ||||
-rw-r--r-- | packages/integrations/sitemap/src/generate-sitemap.ts | 79 | ||||
-rw-r--r-- | packages/integrations/sitemap/src/index.ts | 208 | ||||
-rw-r--r-- | packages/integrations/sitemap/src/schema.ts | 44 | ||||
-rw-r--r-- | packages/integrations/sitemap/src/utils/parse-i18n-url.ts | 53 | ||||
-rw-r--r-- | packages/integrations/sitemap/src/validate-options.ts | 29 | ||||
-rw-r--r-- | packages/integrations/sitemap/src/write-sitemap.ts | 80 |
7 files changed, 500 insertions, 0 deletions
diff --git a/packages/integrations/sitemap/src/config-defaults.ts b/packages/integrations/sitemap/src/config-defaults.ts
new file mode 100644
index 000000000..8d854c7a9
--- /dev/null
+++ b/packages/integrations/sitemap/src/config-defaults.ts
@@ -0,0 +1,7 @@
+import type { SitemapOptions } from './index.js';
+
+/** Defaults for `filenameBase` and `entryLimit`; the Zod schema in `schema.ts` reads them. */
+export const SITEMAP_CONFIG_DEFAULTS = {
+  filenameBase: 'sitemap',
+  entryLimit: 45000,
+} satisfies SitemapOptions;
diff --git a/packages/integrations/sitemap/src/generate-sitemap.ts b/packages/integrations/sitemap/src/generate-sitemap.ts
new file mode 100644
index 000000000..0fb096cc9
--- /dev/null
+++ b/packages/integrations/sitemap/src/generate-sitemap.ts
@@ -0,0 +1,79 @@
+import type { EnumChangefreq } from 'sitemap';
+import type { SitemapItem, SitemapOptions } from './index.js';
+import { parseI18nUrl } from './utils/parse-i18n-url.js';
+
+/**
+ * Build the sitemap entries for a set of page URLs.
+ *
+ * @param pages Page URLs; the array is copied before sorting, so the caller's array is not mutated.
+ * @param finalSiteUrl Site URL including the base path; forwarded to the i18n URL parser.
+ * @param opts Integration options; `changefreq`, `priority`, `lastmod` and `i18n` are read.
+ * @returns One item per URL, in sorted order.
+ */
+export function generateSitemap(pages: string[], finalSiteUrl: string, opts?: SitemapOptions) {
+  const { changefreq, priority, lastmod: lastmodSrc, i18n } = opts ?? {};
+  // TODO: find way to respect <link rel="canonical"> URLs here
+  const urls = [...pages];
+  urls.sort((a, b) => a.localeCompare(b, 'en', { numeric: true })); // sort alphabetically so sitemap is same each time
+
+  const lastmod = lastmodSrc?.toISOString();
+
+  // Parse URLs for i18n matching later
+  const { defaultLocale, locales } = i18n ?? {};
+  let getI18nLinks: GetI18nLinks | undefined;
+  if (defaultLocale && locales) {
+    getI18nLinks = createGetI18nLinks(urls, defaultLocale, locales, finalSiteUrl);
+  }
+
+  const urlData: SitemapItem[] = urls.map((url, i) => ({
+    url,
+    links: getI18nLinks?.(i),
+    lastmod,
+    priority,
+    changefreq: changefreq as EnumChangefreq,
+  }));
+
+  return urlData;
+}
+
+/** Looks up the alternate-language links for the URL at `urlIndex`; `undefined` when there are none. */
+type GetI18nLinks = (urlIndex: number) => SitemapItem['links'] | undefined;
+
+/**
+ * Create a lookup from URL index to the links shared by all URLs with the same locale-less path.
+ *
+ * URLs are grouped once up front, so each lookup is a single `Map` read instead of a scan of
+ * every URL.
+ */
+function createGetI18nLinks(
+  urls: string[],
+  defaultLocale: string,
+  locales: Record<string, string>,
+  finalSiteUrl: string,
+): GetI18nLinks {
+  // `parsedI18nUrls` will have the same length as `urls`, matching correspondingly
+  const parsedI18nUrls = urls.map((url) => parseI18nUrl(url, defaultLocale, locales, finalSiteUrl));
+
+  // Group URLs by path without the locale part, e.g. /en/foo and /es/foo both go under /foo
+  const linksByPath = new Map<string, NonNullable<SitemapItem['links']>>();
+  parsedI18nUrls.forEach((parsed, i) => {
+    if (!parsed) return;
+    let links = linksByPath.get(parsed.path);
+    if (!links) {
+      links = [];
+      linksByPath.set(parsed.path, links);
+    }
+    links.push({ url: urls[i], lang: locales[parsed.locale] });
+  });
+
+  return (urlIndex) => {
+    const i18nUrl = parsedI18nUrls[urlIndex];
+    if (!i18nUrl) {
+      return undefined;
+    }
+
+    // If 0 or 1 (which is itself), return undefined to not create any links.
+    const links = linksByPath.get(i18nUrl.path);
+    return links && links.length > 1 ? links : undefined;
+  };
+}
diff --git a/packages/integrations/sitemap/src/index.ts b/packages/integrations/sitemap/src/index.ts
new file mode 100644
index 000000000..078f78abb
--- /dev/null
+++ b/packages/integrations/sitemap/src/index.ts
@@ -0,0 +1,208 @@
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import type { AstroConfig, AstroIntegration } from 'astro';
+import type { EnumChangefreq, LinkItem as LinkItemBase, SitemapItemLoose } from 'sitemap';
+import { ZodError } from 'zod';
+
+import { generateSitemap } from './generate-sitemap.js';
+import { validateOptions } from './validate-options.js';
+import { writeSitemap } from './write-sitemap.js';
+
+export { EnumChangefreq as ChangeFreqEnum } from 'sitemap';
+export type ChangeFreq = `${EnumChangefreq}`;
+export type SitemapItem = Pick<
+  SitemapItemLoose,
+  'url' | 'lastmod' | 'changefreq' | 'priority' | 'links'
+>;
+export type LinkItem = LinkItemBase;
+
+export type SitemapOptions =
+  | {
+      filenameBase?: string;
+      filter?(page: string): boolean;
+      customPages?: string[];
+
+      i18n?: {
+        defaultLocale: string;
+        locales: Record<string, string>;
+      };
+      // number of entries per sitemap file
+      entryLimit?: number;
+
+      // sitemap specific
+      changefreq?: ChangeFreq;
+      lastmod?: Date;
+      priority?: number;
+
+      // called for each sitemap item just before saving it to disk, sync or async
+      serialize?(item: SitemapItem): SitemapItem | Promise<SitemapItem | undefined> | undefined;
+
+      xslURL?: string;
+    }
+  | undefined;
+
+/** Render a Zod validation error as one line per issue: the option path followed by the message. */
+function formatConfigErrorMessage(err: ZodError) {
+  const errorList = err.issues.map((issue) => ` ${issue.path.join('.')} ${issue.message + '.'}`);
+  return errorList.join('\n');
+}
+
+const PKG_NAME = '@astrojs/sitemap';
+const STATUS_CODE_PAGES = new Set(['404', '500']);
+
+/**
+ * Build a predicate that recognises status-code pages (`404`, `500`), either at the root or
+ * directly under one of the given locale keys, ignoring one leading and one trailing slash.
+ */
+const isStatusCodePage = (locales: string[]) => {
+  const statusPathNames = new Set(
+    locales
+      .flatMap((locale) => [...STATUS_CODE_PAGES].map((page) => `${locale}/${page}`))
+      .concat([...STATUS_CODE_PAGES]),
+  );
+
+  return (pathname: string): boolean => {
+    if (pathname.endsWith('/')) {
+      pathname = pathname.slice(0, -1);
+    }
+    if (pathname.startsWith('/')) {
+      pathname = pathname.slice(1);
+    }
+    return statusPathNames.has(pathname);
+  };
+};
+
+/**
+ * Create the sitemap integration. The Astro config is captured in `astro:config:done`; the
+ * sitemap files are written in `astro:build:done`.
+ */
+const createPlugin = (options?: SitemapOptions): AstroIntegration => {
+  let config: AstroConfig;
+
+  return {
+    name: PKG_NAME,
+
+    hooks: {
+      'astro:config:done': async ({ config: cfg }) => {
+        config = cfg;
+      },
+
+      'astro:build:done': async ({ dir, routes, pages, logger }) => {
+        try {
+          if (!config.site) {
+            logger.warn(
+              'The Sitemap integration requires the `site` astro.config option. Skipping.',
+            );
+            return;
+          }
+
+          const opts = validateOptions(config.site, options);
+
+          const { filenameBase, filter, customPages, serialize, entryLimit } = opts;
+
+          const outFile = `${filenameBase}-index.xml`;
+          const finalSiteUrl = new URL(config.base, config.site);
+          const shouldIgnoreStatus = isStatusCodePage(Object.keys(opts.i18n?.locales ?? {}));
+          let pageUrls = pages
+            .filter((p) => !shouldIgnoreStatus(p.pathname))
+            .map((p) => {
+              // This updates `finalSiteUrl` in place, so the code below sees the trailing slash too
+              if (p.pathname !== '' && !finalSiteUrl.pathname.endsWith('/'))
+                finalSiteUrl.pathname += '/';
+              // Strip the leading slash on a local copy so the `pages` entries are not mutated
+              const pathname = p.pathname.startsWith('/') ? p.pathname.slice(1) : p.pathname;
+              const fullPath = finalSiteUrl.pathname + pathname;
+              return new URL(fullPath, finalSiteUrl).href;
+            });
+
+          const routeUrls = routes.reduce<string[]>((urls, r) => {
+            // Only expose pages, not endpoints or redirects
+            if (r.type !== 'page') return urls;
+
+            /**
+             * Dynamic URLs have entries with `undefined` pathnames
+             */
+            if (r.pathname) {
+              if (shouldIgnoreStatus(r.pathname)) return urls;
+
+              // `finalSiteUrl` may end with a trailing slash
+              // or not because of base paths.
+              let fullPath = finalSiteUrl.pathname;
+              if (fullPath.endsWith('/')) fullPath += r.generate(r.pathname).substring(1);
+              else fullPath += r.generate(r.pathname);
+
+              const newUrl = new URL(fullPath, finalSiteUrl).href;
+
+              if (config.trailingSlash === 'never') {
+                urls.push(newUrl);
+              } else if (config.build.format === 'directory' && !newUrl.endsWith('/')) {
+                urls.push(newUrl + '/');
+              } else {
+                urls.push(newUrl);
+              }
+            }
+
+            return urls;
+          }, []);
+
+          pageUrls = Array.from(new Set([...pageUrls, ...routeUrls, ...(customPages ?? [])]));
+
+          if (filter) {
+            pageUrls = pageUrls.filter(filter);
+          }
+
+          if (pageUrls.length === 0) {
+            logger.warn(`No pages found!\n\`${outFile}\` not created.`);
+            return;
+          }
+
+          let urlData = generateSitemap(pageUrls, finalSiteUrl.href, opts);
+
+          if (serialize) {
+            try {
+              const serializedUrls: SitemapItem[] = [];
+              for (const item of urlData) {
+                const serialized = await Promise.resolve(serialize(item));
+                if (serialized) {
+                  serializedUrls.push(serialized);
+                }
+              }
+              if (serializedUrls.length === 0) {
+                logger.warn('No pages found!');
+                return;
+              }
+              urlData = serializedUrls;
+            } catch (err) {
+              // `String()` also copes with thrown values that have no `toString` (null/undefined)
+              logger.error(`Error serializing pages\n${String(err)}`);
+              return;
+            }
+          }
+          const destDir = fileURLToPath(dir);
+          const xslURL = opts.xslURL ? new URL(opts.xslURL, finalSiteUrl).href : undefined;
+          await writeSitemap(
+            {
+              filenameBase: filenameBase,
+              hostname: finalSiteUrl.href,
+              destinationDir: destDir,
+              publicBasePath: config.base,
+              sourceData: urlData,
+              limit: entryLimit,
+              xslURL: xslURL,
+            },
+            config,
+          );
+          logger.info(`\`${outFile}\` created at \`${path.relative(process.cwd(), destDir)}\``);
+        } catch (err) {
+          if (err instanceof ZodError) {
+            logger.warn(formatConfigErrorMessage(err));
+          } else {
+            throw err;
+          }
+        }
+      },
+    },
+  };
+};
+
+export default createPlugin;
diff --git a/packages/integrations/sitemap/src/schema.ts b/packages/integrations/sitemap/src/schema.ts
new file mode 100644
index 000000000..0ab9d672d
--- /dev/null
+++ b/packages/integrations/sitemap/src/schema.ts
@@ -0,0 +1,44 @@
+import { EnumChangefreq as ChangeFreq } from 'sitemap';
+import { z } from 'zod';
+import { SITEMAP_CONFIG_DEFAULTS } from './config-defaults.js';
+
+const localeKeySchema = z.string().min(1);
+
+/** Zod schema for the integration options; unknown keys are rejected (`strict`). */
+export const SitemapOptionsSchema = z
+  .object({
+    filenameBase: z.string().optional().default(SITEMAP_CONFIG_DEFAULTS.filenameBase),
+    filter: z.function().args(z.string()).returns(z.boolean()).optional(),
+    customPages: z.string().url().array().optional(),
+    canonicalURL: z.string().url().optional(),
+    xslURL: z.string().optional(),
+
+    i18n: z
+      .object({
+        defaultLocale: localeKeySchema,
+        locales: z.record(
+          localeKeySchema,
+          z
+            .string()
+            .min(2)
+            // No `g`/`m` flags: `m` would accept a multi-line value if any one line matched,
+            // and `g` makes the RegExp stateful through `lastIndex`.
+            .regex(/^[a-zA-Z\-]+$/, {
+              message: 'Only English alphabet symbols and hyphen allowed',
+            }),
+        ),
+      })
+      .refine((val) => !val || val.locales[val.defaultLocale], {
+        message: '`defaultLocale` must exist in `locales` keys',
+      })
+      .optional(),
+
+    entryLimit: z.number().nonnegative().optional().default(SITEMAP_CONFIG_DEFAULTS.entryLimit),
+    serialize: z.function().args(z.any()).returns(z.any()).optional(),
+
+    changefreq: z.nativeEnum(ChangeFreq).optional(),
+    lastmod: z.date().optional(),
+    priority: z.number().min(0).max(1).optional(),
+  })
+  .strict()
+  .default(SITEMAP_CONFIG_DEFAULTS);
diff --git a/packages/integrations/sitemap/src/utils/parse-i18n-url.ts b/packages/integrations/sitemap/src/utils/parse-i18n-url.ts
new file mode 100644
index 000000000..86221ca9d
--- /dev/null
+++ b/packages/integrations/sitemap/src/utils/parse-i18n-url.ts
@@ -0,0 +1,53 @@
+interface ParsedI18nUrl {
+  locale: string;
+  path: string;
+}
+
+/**
+ * Split a page URL into its locale and its locale-less path.
+ *
+ * NOTE: The parameters have been schema-validated with Zod
+ *
+ * @param url Page URL to parse.
+ * @param defaultLocale Locale reported when the first path segment is not a key of `locales`.
+ * @param locales Map whose keys are the locale path segments.
+ * @param base Prefix that `url` must start with; it is removed before parsing.
+ * @returns The parsed parts, or `undefined` when `url` does not start with `base`.
+ */
+export function parseI18nUrl(
+  url: string,
+  defaultLocale: string,
+  locales: Record<string, string>,
+  base: string,
+): ParsedI18nUrl | undefined {
+  if (!url.startsWith(base)) {
+    return undefined;
+  }
+
+  let s = url.slice(base.length);
+
+  // Handle root URL
+  if (!s || s === '/') {
+    return { locale: defaultLocale, path: '/' };
+  }
+
+  if (s[0] !== '/') {
+    s = '/' + s;
+  }
+
+  // Get locale from path, e.g.
+  // "/en-US/" -> "en-US"
+  // "/en-US/foo" -> "en-US"
+  const locale = s.split('/')[1];
+  // Own-property check: `in` would also accept inherited keys such as "constructor" or "toString"
+  if (Object.prototype.hasOwnProperty.call(locales, locale)) {
+    // "/en-US/foo" -> "/foo"
+    let path = s.slice(1 + locale.length);
+    if (!path) {
+      path = '/';
+    }
+    return { locale, path };
+  }
+
+  return { locale: defaultLocale, path: s };
+}
diff --git a/packages/integrations/sitemap/src/validate-options.ts b/packages/integrations/sitemap/src/validate-options.ts
new file mode 100644
index 000000000..f51750ff5
--- /dev/null
+++ b/packages/integrations/sitemap/src/validate-options.ts
@@ -0,0 +1,29 @@
+import { z } from 'zod';
+import type { SitemapOptions } from './index.js';
+import { SitemapOptionsSchema } from './schema.js';
+
+/**
+ * Parse the integration options against `SitemapOptionsSchema`, then check that either `site`
+ * or `canonicalURL` is set.
+ *
+ * @returns The parsed options with schema defaults applied.
+ * @throws ZodError when either check fails.
+ * @internal
+ */
+export const validateOptions = (site: string | undefined, opts: SitemapOptions) => {
+  const result = SitemapOptionsSchema.parse(opts);
+
+  z.object({
+    site: z.string().optional(), // Astro takes care of `site`: how to validate, transform and refine
+    canonicalURL: z.string().optional(), // `canonicalURL` is already validated in prev step
+  })
+    .refine((options) => options.site || options.canonicalURL, {
+      message: 'Required `site` astro.config option or `canonicalURL` integration option',
+    })
+    .parse({
+      site,
+      canonicalURL: result.canonicalURL,
+    });
+
+  return result;
+};
diff --git a/packages/integrations/sitemap/src/write-sitemap.ts b/packages/integrations/sitemap/src/write-sitemap.ts
new file mode 100644
index 000000000..939bd91be
--- /dev/null
+++ b/packages/integrations/sitemap/src/write-sitemap.ts
@@ -0,0 +1,80 @@
+import { type WriteStream, createWriteStream } from 'node:fs';
+import { mkdir } from 'node:fs/promises';
+import { normalize, resolve } from 'node:path';
+import { Readable, pipeline } from 'node:stream';
+import { promisify } from 'node:util';
+import replace from 'stream-replace-string';
+
+import { SitemapAndIndexStream, SitemapStream } from 'sitemap';
+
+import type { AstroConfig } from 'astro';
+import type { SitemapItem } from './index.js';
+
+type WriteSitemapConfig = {
+  filenameBase: string;
+  hostname: string;
+  sitemapHostname?: string;
+  sourceData: SitemapItem[];
+  destinationDir: string;
+  publicBasePath?: string;
+  limit?: number;
+  xslURL?: string;
+};
+
+/**
+ * Write `<filenameBase>-index.xml` and the numbered `<filenameBase>-<i>.xml` files it references
+ * into `destinationDir`, creating the directory if needed.
+ *
+ * Adapted from sitemap.js/sitemap-simple.
+ */
+export async function writeSitemap(
+  {
+    filenameBase,
+    hostname,
+    sitemapHostname = hostname,
+    sourceData,
+    destinationDir,
+    limit = 50000,
+    publicBasePath = './',
+    xslURL: xslUrl,
+  }: WriteSitemapConfig,
+  astroConfig: AstroConfig,
+) {
+  await mkdir(destinationDir, { recursive: true });
+
+  const sitemapAndIndexStream = new SitemapAndIndexStream({
+    limit,
+    xslUrl,
+    getSitemapStream: (i) => {
+      const sitemapStream = new SitemapStream({
+        hostname,
+        xslUrl,
+      });
+      const path = `./${filenameBase}-${i}.xml`;
+      const writePath = resolve(destinationDir, path);
+      if (!publicBasePath.endsWith('/')) {
+        publicBasePath += '/';
+      }
+      const publicPath = normalize(publicBasePath + path);
+
+      let stream: WriteStream;
+      if (astroConfig.trailingSlash === 'never' || astroConfig.build.format === 'file') {
+        // workaround for trailing slash issue in sitemap.js: https://github.com/ekalinin/sitemap.js/issues/403
+        const host = hostname.endsWith('/') ? hostname.slice(0, -1) : hostname;
+        const searchStr = `<loc>${host}/</loc>`;
+        const replaceStr = `<loc>${host}</loc>`;
+        stream = sitemapStream
+          .pipe(replace(searchStr, replaceStr))
+          .pipe(createWriteStream(writePath));
+      } else {
+        stream = sitemapStream.pipe(createWriteStream(writePath));
+      }
+
+      return [new URL(publicPath, sitemapHostname).toString(), sitemapStream, stream];
+    },
+  });
+
+  const src = Readable.from(sourceData);
+  const indexPath = resolve(destinationDir, `./${filenameBase}-index.xml`);
+  return promisify(pipeline)(src, sitemapAndIndexStream, createWriteStream(indexPath));
+}