// Builds server/data/wordlists/common.txt — the clue-able word subset the Mini
// Crossword generator fills from. It intersects the ENABLE dictionary with the
// Norvig word-frequency list, applying a *tighter rank cutoff for short words*:
// short obscure words (ANI, CESS, SEG) are the worst crossword fill, and the
// truly common short words rank very high, so a low cutoff filters the junk
// while keeping enough vocabulary to fill.
//
// Usage: node server/scripts/buildCommonWords.js
// Caches the 5 MB frequency list at /tmp/count_1w.txt to avoid re-downloading.
|
|
|
|
import fs from 'node:fs';
|
|
import path from 'node:path';
|
|
import { fileURLToPath } from 'node:url';
|
|
|
|
// Directory containing this script (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Input dictionary and generated output, both resolved relative to this script.
const ENABLE_PATH = path.join(__dirname, '../data/wordlists/enable1.txt');
const OUT_PATH = path.join(__dirname, '../data/wordlists/common.txt');

// Remote word-frequency list and the local file it is cached in.
const FREQ_URL = 'https://norvig.com/ngrams/count_1w.txt';
const FREQ_CACHE = '/tmp/count_1w.txt';

// Frequency-rank cutoff per word length. Short words get a much stricter bar.
// Lengths with no entry here are never selected. Frozen so the shared lookup
// table cannot be modified by accident.
const CUTOFF = Object.freeze({ 3: 6000, 4: 18000, 5: 45000, 6: 70000, 7: 70000 });
|
|
|
|
/**
 * Returns the text of the word-frequency list.
 *
 * Reads FREQ_CACHE when that file exists; otherwise downloads FREQ_URL,
 * writes the body to FREQ_CACHE and returns it.
 *
 * @returns {Promise<string>} The frequency list contents.
 * @throws {Error} If the download responds with a non-2xx HTTP status.
 */
async function loadFreq() {
  if (fs.existsSync(FREQ_CACHE)) return fs.readFileSync(FREQ_CACHE, 'utf8');

  console.log('Downloading frequency list...');
  const response = await fetch(FREQ_URL);
  // fetch() resolves even on 4xx/5xx. Without this check an error page would
  // be written to the cache and reused on every later run.
  if (!response.ok) {
    throw new Error(
      `Failed to download ${FREQ_URL}: HTTP ${response.status} ${response.statusText}`,
    );
  }

  const text = await response.text();
  fs.writeFileSync(FREQ_CACHE, text);
  return text;
}
|
|
|
|
/**
 * Builds the common-word list and writes it to OUT_PATH.
 *
 * Reads the dictionary at ENABLE_PATH (one word per line) and the frequency
 * list from loadFreq() (first tab-separated field of each line is the word).
 * A word is kept when its zero-based position in the frequency list is below
 * CUTOFF[word.length], it consists only of A-Z after upper-casing, and it is
 * present in the dictionary. Output is upper-case, one word per line, in
 * frequency order.
 *
 * @returns {Promise<void>}
 * @throws {Error} If no word qualifies; OUT_PATH is left untouched in that case.
 */
async function main() {
  const enable = new Set(
    fs.readFileSync(ENABLE_PATH, 'utf8').split('\n').map((w) => w.trim().toUpperCase()),
  );
  const ranked = (await loadFreq())
    .split('\n')
    .map((line) => line.split('\t')[0])
    .filter(Boolean)
    .map((w) => w.toUpperCase());

  const lettersOnly = /^[A-Z]+$/;
  const out = [];
  const byLen = {};
  for (const [rank, word] of ranked.entries()) {
    const len = word.length;
    const cutoff = CUTOFF[len];
    // Lengths without a cutoff entry are never selected.
    if (!cutoff || rank >= cutoff) continue;
    if (!lettersOnly.test(word) || !enable.has(word)) continue;
    out.push(word);
    byLen[len] = (byLen[len] ?? 0) + 1;
  }

  // An empty result means an input is malformed (for example a bad cached
  // frequency file). Fail loudly instead of replacing the list with nothing.
  if (out.length === 0) {
    throw new Error(
      `No words selected; refusing to overwrite ${OUT_PATH}. Check ${ENABLE_PATH} and the frequency list.`,
    );
  }

  fs.writeFileSync(OUT_PATH, `${out.join('\n')}\n`);
  console.log(`Wrote ${out.length} common words to ${OUT_PATH}`);
  console.log('Per length:', JSON.stringify(byLen));
}
|
|
|
|
// Run the build. Report any failure and exit non-zero rather than leaving the
// promise returned by main() unhandled.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});