stats/scripts/add-internal-links.mjs

#!/usr/bin/env node

/**
 * Adds internal links to feature pages across MDX content files.
 *
 * Rules:
 * - Only links the FIRST mention of each feature per file
 * - Skips code blocks, inline code, existing links, headings, JSX tags, imports,
 *   frontmatter, markdown table rows, and blockquote lines that contain links
 * - Skips if the feature URL is already linked somewhere on the page
 * - Skips "data retention" (not about the retention feature)
 * - Adds `updated: YYYY-MM-DD` to frontmatter of modified articles & guides
 */

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// One level above the directory that contains this script.
const ROOT = path.resolve(__dirname, '..');
// Root of the content tree that is scanned for .mdx files.
const CONTENT_DIR = path.join(ROOT, 'apps/public/content');
// Date written into the `updated` frontmatter field of modified files.
// NOTE(review): hard-coded rather than read from the clock — confirm it is
// bumped before each run.
const TODAY = '2026-02-07';

// ── Feature definitions ─────────────────────────────────────────────
// Each entry describes one feature page:
//   slug           identifier used in the report
//   url            link target; a file that already contains it is left alone
//   patterns       phrases to link, matched case-insensitively on word
//                  boundaries. Patterns are tried in order; first match wins
//                  for each feature. Longer / more specific patterns come
//                  first to avoid partial matches.
//   excludeBefore  words that, directly before a match, veto it
//   excludeAfter   words that, directly after a match, veto it (prefix match,
//                  so 'funnel' also covers "funnels")
const FEATURES = [
  {
    slug: 'event-tracking',
    url: '/features/event-tracking',
    patterns: ['event tracking'],
  },
  {
    slug: 'session-tracking',
    url: '/features/session-tracking',
    patterns: ['session tracking'],
  },
  {
    slug: 'revenue-tracking',
    url: '/features/revenue-tracking',
    patterns: ['revenue tracking'],
  },
  {
    slug: 'data-visualization',
    url: '/features/data-visualization',
    patterns: ['data visualization'],
  },
  {
    slug: 'identify-users',
    url: '/features/identify-users',
    patterns: ['identify users', 'user identification'],
  },
  {
    slug: 'web-analytics',
    url: '/features/web-analytics',
    patterns: ['web analytics'],
  },
  {
    slug: 'funnels',
    url: '/features/funnels',
    // "conversion funnel(s)" links to funnels, not conversion
    patterns: [
      'conversion funnels',
      'conversion funnel',
      'funnel analysis',
      'funnels',
      'funnel',
    ],
  },
  {
    slug: 'retention',
    url: '/features/retention',
    // "retention" alone is included but guarded by excludeBefore
    patterns: [
      'retention analysis',
      'user retention',
      'retention rates',
      'retention rate',
      'retention',
    ],
    excludeBefore: ['data', 'unlimited'], // skip "data retention", "unlimited retention"
    excludeAfter: ['period', 'policy', 'limit', 'of data'],
  },
  {
    slug: 'conversion',
    url: '/features/conversion',
    patterns: [
      'conversion tracking',
      'conversion rates',
      'conversion rate',
      'conversion paths',
      'conversions',
      'conversion',
    ],
    excludeBefore: ['data'],
    // "conversion funnel(s)" belongs to the funnels feature. Without this
    // guard, a mention that funnels did not link (funnels URL already on the
    // page, or a second mention) had its "conversion" linked here instead.
    excludeAfter: ['funnel'],
  },
];

// Directories to scan (relative to CONTENT_DIR); the main loop skips any that
// do not exist on disk.
const DIRS = ['articles', 'guides', 'docs', 'pages'];
// Only files under these dirs get the `updated` frontmatter field, and only
// when the file was actually modified.
const DIRS_WITH_UPDATED = ['articles', 'guides'];

// ── Helpers ──────────────────────────────────────────────────────────

/**
 * Collect the character ranges of `text` that must never be rewritten.
 *
 * Ranges are grouped by kind, in a fixed order: fenced code, inline code,
 * markdown links, headings, JSX/HTML tags, import lines, the leading
 * frontmatter block, table rows, and blockquote lines containing a link.
 * Ranges of different kinds may overlap each other.
 *
 * @param {string} text - Full file contents.
 * @returns {{ start: number, end: number }[]} Half-open [start, end) ranges.
 */
function getSkipZones(text) {
  const toZone = (match) => ({
    start: match.index,
    end: match.index + match[0].length,
  });
  const zonesFor = (re) => Array.from(text.matchAll(re), toZone);

  // Only a frontmatter block at the very start of the file counts, so this
  // one is a single anchored match rather than a global scan.
  const frontmatter = /^---[\s\S]*?---/.exec(text);

  return [
    // Fenced code blocks  ```…```
    ...zonesFor(/```[\s\S]*?```/g),
    // Inline code `…`
    ...zonesFor(/`[^`\n]+`/g),
    // Existing markdown links [text](url)
    ...zonesFor(/\[[^\]]*\]\([^)]*\)/g),
    // Headings  # … (entire line)
    ...zonesFor(/^#{1,6}\s+.+$/gm),
    // JSX / HTML tags (attributes may contain feature words)
    ...zonesFor(/<[^>]+>/g),
    // import statements
    ...zonesFor(/^import\s+.+$/gm),
    // Frontmatter block
    ...(frontmatter ? [toZone(frontmatter)] : []),
    // Markdown table rows (| … |)
    ...zonesFor(/^\|.+\|$/gm),
    // > blockquote lines that contain links
    ...zonesFor(/^>\s.*\[.*\]\(.*\).*$/gm),
  ];
}

/**
 * Tell whether the span [pos, pos + len) intersects any skip zone.
 * Spans that merely touch a zone's edge do not count as overlapping.
 *
 * @param {number} pos - Start offset of the span.
 * @param {number} len - Length of the span.
 * @param {{ start: number, end: number }[]} zones - Half-open ranges.
 * @returns {boolean} True if at least one zone intersects the span.
 */
function overlapsSkipZone(pos, len, zones) {
  const spanEnd = pos + len;
  for (const { start, end } of zones) {
    // Entirely before or entirely after this zone: keep looking.
    if (spanEnd <= start || pos >= end) {
      continue;
    }
    return true;
  }
  return false;
}

/**
 * Backslash-escape every regex metacharacter in `s` so the result can be
 * embedded in a RegExp source and match the text literally.
 *
 * @param {string} s - Literal text.
 * @returns {string} The text with metacharacters escaped.
 */
function escapeRegex(s) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => `\\${ch}`);
}

// ── Core processing ──────────────────────────────────────────────────

/**
 * Check whether the words around a match mark it as a different concept
 * (e.g. "data retention") according to the feature's exclusion lists.
 *
 * @param {string} content - Text being scanned.
 * @param {number} start - Offset of the first matched character.
 * @param {number} end - Offset just past the last matched character.
 * @param {{ excludeBefore?: string[], excludeAfter?: string[] }} feature
 * @returns {boolean} True if the match must be skipped.
 */
function hasExcludedContext(content, start, end, feature) {
  if (feature.excludeBefore) {
    const before = content.slice(Math.max(0, start - 20), start).toLowerCase();
    if (feature.excludeBefore.some((w) => before.endsWith(`${w} `))) {
      return true;
    }
  }
  if (feature.excludeAfter) {
    const after = content.slice(end, end + 20).toLowerCase();
    if (feature.excludeAfter.some((w) => after.startsWith(` ${w}`))) {
      return true;
    }
  }
  return false;
}

/**
 * Find the mention of `feature` that should become a link: patterns are tried
 * in order, and for each pattern the first occurrence that is outside every
 * skip zone and not vetoed by its context wins.
 *
 * @returns {RegExpMatchArray | null} The winning match, or null if none.
 */
function findLinkableMention(content, feature, skipZones) {
  for (const pattern of feature.patterns) {
    const re = new RegExp(`\\b${escapeRegex(pattern)}\\b`, 'gi');
    for (const m of content.matchAll(re)) {
      if (overlapsSkipZone(m.index, m[0].length, skipZones)) {
        continue;
      }
      if (hasExcludedContext(content, m.index, m.index + m[0].length, feature)) {
        continue;
      }
      return m;
    }
  }
  return null;
}

/**
 * Set `updated: TODAY` in the leading frontmatter block: overwrite an existing
 * `updated:` line, else insert after the `date:` line, else append. Content
 * without a frontmatter block is returned unchanged.
 */
function stampUpdatedDate(content) {
  const fmMatch = content.match(/^---\n([\s\S]*?)\n---/);
  if (!fmMatch) {
    return content;
  }

  let fm = fmMatch[1];
  // `.*` (not `\s*.+`) keeps each match on its own line, so an empty
  // `updated:` / `date:` value cannot swallow the following line. Function
  // replacers keep the inserted text literal.
  if (/^updated:/m.test(fm)) {
    fm = fm.replace(/^updated:.*$/m, () => `updated: ${TODAY}`);
  } else if (/^date:/m.test(fm)) {
    fm = fm.replace(/^date:.*$/m, (line) => `${line}\nupdated: ${TODAY}`);
  } else {
    fm += `\nupdated: ${TODAY}`;
  }

  // The match is anchored at offset 0, so splice by position. Passing the
  // frontmatter as a String.replace replacement would expand `$'`, `$&`,
  // `$$` … found in its values and corrupt the file.
  return `---\n${fm}\n---${content.slice(fmMatch[0].length)}`;
}

/**
 * Link the first eligible mention of every feature in one MDX file and write
 * the file back if anything changed.
 *
 * A feature is skipped entirely when its URL already appears anywhere in the
 * file. For files under a directory listed in DIRS_WITH_UPDATED, the
 * `updated` frontmatter field is set to TODAY when at least one link was added.
 *
 * @param {string} filePath - Path of the file to process.
 * @param {string} dir - Top-level content directory the file was found under.
 * @returns {{ feature: string, matched: string }[] | null} One entry per link
 *   added, or null when the file was left untouched.
 */
function processFile(filePath, dir) {
  let content = fs.readFileSync(filePath, 'utf8');

  const skipZones = getSkipZones(content);
  const changes = [];

  for (const feature of FEATURES) {
    // If the file already links to this feature URL, skip entirely
    if (content.includes(feature.url)) {
      continue;
    }

    const m = findLinkableMention(content, feature, skipZones);
    if (!m) {
      continue;
    }

    const matchEnd = m.index + m[0].length;
    // Use the same URL that the "already linked" check above looks for.
    const replacement = `[${m[0]}](${feature.url})`;
    content = content.slice(0, m.index) + replacement + content.slice(matchEnd);

    // Shift every zone at or after the end of the match. `>=` matters: a zone
    // that begins right where the match ends (e.g. inline code directly after
    // the word) moves too, otherwise it would go stale and stop protecting
    // its text from later features.
    const lenDiff = replacement.length - m[0].length;
    for (const z of skipZones) {
      if (z.start >= matchEnd) {
        z.start += lenDiff;
        z.end += lenDiff;
      }
    }
    // The new link itself must not be re-linked by a later feature.
    skipZones.push({ start: m.index, end: m.index + replacement.length });

    changes.push({ feature: feature.slug, matched: m[0] });
  }

  if (changes.length === 0) {
    return null;
  }

  // Add / update the `updated` frontmatter field for articles & guides
  if (DIRS_WITH_UPDATED.includes(dir)) {
    content = stampUpdatedDate(content);
  }

  fs.writeFileSync(filePath, content, 'utf8');
  return changes;
}

// ── Walk directories ─────────────────────────────────────────────────

/**
 * Recursively list every `.mdx` file under `dir`.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths (each built by joining onto `dir`) in directory
 *   listing order, with a subdirectory's files placed where that
 *   subdirectory appears.
 */
function walk(dir) {
  return fs.readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      return walk(fullPath);
    }
    return entry.name.endsWith('.mdx') ? [fullPath] : [];
  });
}

// ── Main ─────────────────────────────────────────────────────────────

// One entry per modified file: { file (relative to ROOT), changes }.
const results = [];

for (const dir of DIRS) {
  const dirPath = path.join(CONTENT_DIR, dir);
  if (fs.existsSync(dirPath)) {
    walk(dirPath).forEach((file) => {
      const changes = processFile(file, dir);
      if (changes) {
        results.push({ file: path.relative(ROOT, file), changes });
      }
    });
  }
}

// Build the whole report first, then print it in one go.
const totalLinks = results.flatMap((r) => r.changes).length;
const reportLines = [
  '=== Internal Linking Report ===\n',
  `Total files modified: ${results.length}`,
  `Total links added: ${totalLinks}\n`,
];

for (const { file, changes } of results) {
  reportLines.push(`  ${file}`);
  for (const { matched, feature } of changes) {
    reportLines.push(`    -> "${matched}" => /features/${feature}`);
  }
}
reportLines.push('\nDone.');

console.log(reportLines.join('\n'));