// Builds server/data/wordlists/common.txt — the clue-able word subset the Mini
// Crossword generator fills from. It intersects the ENABLE dictionary with the
// Norvig word-frequency list, applying a *tighter rank cutoff for short words*:
// short obscure words (ANI, CESS, SEG) are the worst crossword fill, and the
// truly common short words rank very high, so a low cutoff filters the junk
// while keeping enough vocabulary to fill.
//
// Usage: node server/scripts/buildCommonWords.js
// Caches the 5 MB frequency list at /tmp/count_1w.txt to avoid re-downloading.

import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const ENABLE_PATH = path.join(__dirname, '../data/wordlists/enable1.txt');
const OUT_PATH = path.join(__dirname, '../data/wordlists/common.txt');
const FREQ_URL = 'https://norvig.com/ngrams/count_1w.txt';
const FREQ_CACHE = '/tmp/count_1w.txt';

// Frequency-rank cutoff per word length. Short words get a much stricter bar.
// Lengths without an entry are excluded from the output entirely.
const CUTOFF = Object.freeze({ 3: 6000, 4: 18000, 5: 45000, 6: 70000, 7: 70000 });

// Only plain A-Z words are kept. No `g` flag, so `.test` is stateless.
const LETTERS_ONLY = /^[A-Z]+$/;

/**
 * Returns the raw text of the frequency list, reading the local cache when it
 * exists and downloading (then caching) it otherwise.
 *
 * @returns {Promise<string>} Contents of the frequency list.
 * @throws {Error} When the download responds with a non-2xx status. The cache
 *   is not written in that case, so an error page is never reused later.
 */
async function loadFreq() {
  if (fs.existsSync(FREQ_CACHE)) return fs.readFileSync(FREQ_CACHE, 'utf8');

  console.log('Downloading frequency list...');
  const response = await fetch(FREQ_URL);
  if (!response.ok) {
    throw new Error(
      `Failed to download ${FREQ_URL}: HTTP ${response.status} ${response.statusText}`,
    );
  }
  const text = await response.text();
  fs.writeFileSync(FREQ_CACHE, text);
  return text;
}

/**
 * Extracts the upper-cased word column (text before the first tab) from the
 * frequency list. The position of a word in the returned array is what the
 * rest of the script treats as its rank.
 *
 * @param {string} text Raw frequency-list text, one tab-separated entry per line.
 * @returns {string[]} Upper-cased words in file order, empty entries removed.
 */
function parseRankedWords(text) {
  return text
    .split('\n')
    // Trim so stray whitespace or "\r" lines do not occupy a rank slot.
    .map((line) => line.split('\t')[0].trim().toUpperCase())
    .filter(Boolean);
}

/**
 * Picks the words that pass the per-length rank cutoff, consist only of A-Z,
 * and are present in the dictionary.
 *
 * @param {string[]} rankedWords Upper-cased words; array index is the rank.
 * @param {Set<string>} dictionary Upper-cased allowed words.
 * @param {Object<number, number>} [cutoffs=CUTOFF] Exclusive rank limit per word length.
 * @returns {{ words: string[], countsByLength: Object<number, number> }}
 *   Selected words in rank order plus a tally per word length.
 */
function selectCommonWords(rankedWords, dictionary, cutoffs = CUTOFF) {
  const words = [];
  const countsByLength = {};

  rankedWords.forEach((word, rank) => {
    const cutoff = cutoffs[word.length];
    if (cutoff === undefined || rank >= cutoff) return;
    if (!LETTERS_ONLY.test(word) || !dictionary.has(word)) return;

    words.push(word);
    countsByLength[word.length] = (countsByLength[word.length] ?? 0) + 1;
  });

  return { words, countsByLength };
}

/**
 * Entry point: loads the ENABLE dictionary and the frequency list, writes the
 * filtered word list to OUT_PATH (one word per line), and logs a summary.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const enable = new Set(
    fs
      .readFileSync(ENABLE_PATH, 'utf8')
      .split('\n')
      .map((w) => w.trim().toUpperCase()),
  );
  const ranked = parseRankedWords(await loadFreq());
  const { words, countsByLength } = selectCommonWords(ranked, enable);

  fs.writeFileSync(OUT_PATH, `${words.join('\n')}\n`);
  console.log(`Wrote ${words.length} common words to ${OUT_PATH}`);
  console.log('Per length:', JSON.stringify(countsByLength));
}

// Report failures explicitly and signal them through the exit code instead of
// leaving the promise unhandled.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});