// Source: fertig-classic-games/server/scripts/buildCommonWords.js (55 lines, 2.1 KiB, JavaScript)

// Builds server/data/wordlists/common.txt — the clue-able word subset the Mini
// Crossword generator fills from. It intersects the ENABLE dictionary with the
// Norvig word-frequency list, applying a *tighter rank cutoff for short words*:
// short obscure words (ANI, CESS, SEG) are the worst crossword fill, and the
// truly common short words rank very high, so a low cutoff filters the junk
// while keeping enough vocabulary to fill.
//
// Usage: node server/scripts/buildCommonWords.js
// Caches the 5 MB frequency list at /tmp/count_1w.txt to avoid re-downloading.
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
// Directory containing this script (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Input dictionary and generated output, both under ../data/wordlists.
const ENABLE_PATH = path.resolve(__dirname, '..', 'data', 'wordlists', 'enable1.txt');
const OUT_PATH = path.resolve(__dirname, '..', 'data', 'wordlists', 'common.txt');

// Remote frequency list and the local file it is cached in.
const FREQ_URL = 'https://norvig.com/ngrams/count_1w.txt';
const FREQ_CACHE = '/tmp/count_1w.txt';

// Maximum (exclusive) frequency rank accepted for each word length. Lengths
// without an entry are skipped entirely; shorter words get a stricter bar.
const CUTOFF = {
  3: 6000,
  4: 18000,
  5: 45000,
  6: 70000,
  7: 70000,
};
/**
 * Returns the raw text of the word-frequency list.
 *
 * Reads the cached copy at FREQ_CACHE when that file exists; otherwise
 * downloads FREQ_URL, writes the body to FREQ_CACHE, and returns it.
 *
 * @returns {Promise<string>} The frequency list contents.
 * @throws {Error} If the download responds with a non-2xx HTTP status.
 */
async function loadFreq() {
  if (fs.existsSync(FREQ_CACHE)) return fs.readFileSync(FREQ_CACHE, 'utf8');

  console.log('Downloading frequency list...');
  const response = await fetch(FREQ_URL);
  // fetch() resolves on HTTP error statuses too; without this check an error
  // page would be cached and silently reused as the word list on every run.
  if (!response.ok) {
    throw new Error(
      `Failed to download ${FREQ_URL}: HTTP ${response.status} ${response.statusText}`,
    );
  }

  const text = await response.text();
  fs.writeFileSync(FREQ_CACHE, text);
  return text;
}
/**
 * Builds the common-word list and writes it to OUT_PATH.
 *
 * A word from the frequency list is kept when its length has a CUTOFF entry,
 * its rank (0-based position in the list) is below that entry, it consists
 * only of A-Z after upper-casing, and it appears in the ENABLE dictionary.
 * Words are written one per line, in frequency-list order, followed by a
 * summary on stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Dictionary of allowed words, upper-cased and stripped of stray whitespace.
  const dictionary = new Set();
  for (const line of fs.readFileSync(ENABLE_PATH, 'utf8').split('\n')) {
    dictionary.add(line.trim().toUpperCase());
  }

  // First tab-separated column of every non-empty frequency line, upper-cased.
  const freqText = await loadFreq();
  const ranked = [];
  for (const line of freqText.split('\n')) {
    const [word] = line.split('\t');
    if (word) ranked.push(word.toUpperCase());
  }

  const selected = [];
  const perLength = {};
  for (const [rank, word] of ranked.entries()) {
    const size = word.length;
    const limit = CUTOFF[size];
    // Skip lengths with no cutoff and words ranked at or beyond the cutoff.
    if (!limit || rank >= limit) continue;

    const isAlphabetic = /^[A-Z]+$/.test(word);
    if (!(isAlphabetic && dictionary.has(word))) continue;

    selected.push(word);
    perLength[size] = (perLength[size] ?? 0) + 1;
  }

  fs.writeFileSync(OUT_PATH, `${selected.join('\n')}\n`);
  console.log(`Wrote ${selected.length} common words to ${OUT_PATH}`);
  console.log('Per length:', JSON.stringify(perLength));
}
// Run the build. Handle a rejection explicitly (log it and exit non-zero)
// instead of leaving the promise returned by main() floating.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});