Create vocabulary md files (one for letter) with words extracted from notes

EmanueleTinari · May 19, 2025, 10:29pm

What I’m trying to do

Hello from Italy (so, I’m sorry for my bad English).
I’m trying to write a DataviewJS that should extract from one or more files (notes) all unique words in it, ordered its alphabetically, and create new notes in a specific folder, one for every letter in alphabet.
I thought to give

a starting folder (where in this moment only a md file is in),
a destination folder for ‘letters’ md’s created,
a minimum words length (to exclude conjunction, articles, pronouns, preposition etc…),
an excluding prefix for file in starting folder (in my case, files that starting with ‘_’),
an ‘already scanned files’ (put in destination folder too) to register files already elaborated and to not rescan if script is relaunched (I can choose if rescan or not starting script)

Script must exclude Yalm Frontmatter elements, and footpage (in my case from last 3 lines “—” to eof.
Every unique world found must be write in its “letter.md” file (a.md, b.md, c.md, and so on…) saved in “Vocabols” folder, with on the left the same word found and on the right the link to the file where word is found; if it is found in more than one file, all file links must be write on right side, in second or third lines as shown below:


1st word found	link to 1st file
	link to 2nd file
	link to 3rd file…
2nd word found	Link to 1st file
	link to 2nd file…

(I use a table for render better my idea, but if I can’t extract nothing, I can’t build the part of code dedicated to write extracted datas in a table form, so this part of code will be wrote in next “version”).
I admit use of IA to help me create script.
Unfortunately I trow in error with TFile: Obsidian Console CTRL+Shift+I says that "TFile is undefined

I got
Obsidian updated @ v 1.8.10
Dataview updated @ v 0.5.68
Templater updated @ v 2.11.1

I try below script in my “normal” vault but also in a new, fresh and clean vault, with only two plugin, above indicated, installed.

Things I have tried


//```DataviewJS
// (I wrote correctly in my note, but here wish utilize JavaScript color coding to see better my pasted code)

// --- Configuration ---

// Case-sensitive prefix for folders to scan
const folderPrefix = "Document";
// Case-sensitive prefix for files to exclude
const excludedPrefix = "_";
// Minimum length for a word to be included
const minWordLenght = 3;
// Folder where the output files will be created
const outputFolder = "Vocaboli";
// File to track already scanned files
const alreadyScannedFiles = "already_scanned.md";



// --- Helper Functions ---
function getWordsFromText(text) {
    if (!text || typeof text !== 'string') {
        return [];
    }
    const wordsArray = text.toLowerCase().match(/\b[a-z']+\b/g);
    if (!wordsArray) {
        return [];
    }
    return wordsArray.filter(word =>
        word.length >= minWordLenght &&
        !/^\d+$/.test(word) &&
        /[a-z]/.test(word)
    );
}

// --- File Management Functions ---
async function ensureFolderExists(folderPath) {
    const adapter = app.vault.adapter;
    try {
        const folderExists = await adapter.exists(folderPath);
        if (folderExists) {
            const stat = await adapter.stat(folderPath);
            if (stat.type === 'folder') {
                return true;
            } else {
                console.error(`"${folderPath}" exists but is a file, not a folder.`);
                new Notice(`Error: "${folderPath}" is a file. Expected a folder.`);
                return false;
            }
        } else {
            await adapter.mkdir(folderPath);
            console.log(`Successfully created folder "${folderPath}".`);
            return true;
        }
    } catch (error) {
        console.error(`Error ensuring folder "${folderPath}" exists:`, error);
        new Notice(`Failed to ensure folder "${folderPath}". Check console.`);
        return false;
    }
}

async function getAlreadyScannedFiles() {
    const filePath = `${outputFolder}/${alreadyScannedFiles}`;
    try {
        // Check if TFile is defined
        if (typeof TFile === 'undefined') {
            console.error("TFile is not defined in this environment. Cannot reliably check file type.");
            // Fallback or error handling if TFile is not available
        }

        const abstractFile = app.vault.getAbstractFileByPath(filePath);
        if (abstractFile instanceof TFile) { // This is the line that might cause "TFile is not defined"
            const content = await app.vault.read(abstractFile);
            return new Set(content.split("\n").map(line => line.trim()).filter(line => line !== ""));
        } else if (abstractFile === null) { // File does not exist
            console.log(`File "${filePath}" not found. Assuming no files have been scanned yet.`);
            return new Set();
        } else { // It exists but is not a TFile (e.g., a TFolder)
            console.warn(`"${filePath}" exists but is not a regular file. Assuming no files have been scanned yet.`);
            return new Set();
        }
    } catch (e) {
        console.log(`Error reading "${filePath}". Assuming no files have been scanned yet.`, e);
        return new Set();
    }
}

async function writeDataToFile(filePath, content) {
    try {
        const abstractFile = app.vault.getAbstractFileByPath(filePath);
        if (abstractFile instanceof TFile) {
            await app.vault.modify(abstractFile, content);
        } else if (abstractFile === null) { // File does not exist, create it
            await app.vault.create(filePath, content);
        } else { // Path exists but is a folder
             console.error(`Cannot write to "${filePath}" as it is a folder.`);
             new Notice(`Error: Cannot write to "${filePath}", it's a folder.`);
             return false;
        }
        console.log(`Successfully wrote to "${filePath}".`);
        return true;
    } catch (e) {
        console.error(`Failed to write to "${filePath}":`, e);
        new Notice(`Failed to write to "${filePath}". Check console for details.`);
        return false;
    }
}

async function writeAlreadyScannedFiles(filesSet) {
    const filePath = `${outputFolder}/${alreadyScannedFiles}`;
    const content = Array.from(filesSet).sort().join("\n");
    await writeDataToFile(filePath, content);
}

async function writeWordsToLetterFile(letter, wordsSet) {
    const filePath = `${outputFolder}/${letter}.md`;
    const content = Array.from(wordsSet).sort().join("\n");
    await writeDataToFile(filePath, content);
}

// --- Main Logic ---
(async () => {
    // Debug: Check if TFile is available
    if (typeof TFile === 'undefined') {
        console.error("CRITICAL: TFile is undefined in the DataviewJS environment. File operations might fail. Please check your Obsidian and Dataview setup.");
        new Notice("CRITICAL: TFile is undefined. Check console.", 5000);
        // Depending on severity, you might want to return early
    }

    dv.paragraph("Starting word extraction process...");

    // 1. Ensure the output folder exists
    if (!await ensureFolderExists(outputFolder)) {
        dv.paragraph(`Failed to create or access folder "${outputFolder}". Aborting.`);
        return;
    }

    // 2. Load the list of already scanned files
    let alreadyScanned = await getAlreadyScannedFiles();

    // 3. Prompt the user to rescan or not
    const shouldRescan = confirm("Do you want to rescan all files? (Press OK for Yes, Cancel for No to scan only new files.)");

    if (shouldRescan) {
        console.log("Rescanning all files.");
        new Notice("Rescanning all files.", 3000);
        alreadyScanned = new Set(); // Clear the set
        // Clear existing letter files
        for (let i = 0; i < 26; i++) {
            const letter = String.fromCharCode(97 + i); // a, b, c, ...
            await writeWordsToLetterFile(letter, new Set()); // Clear the file
        }
        // Clear the already_scanned.md file as well
        await writeDataToFile(`${outputFolder}/${alreadyScannedFiles}`, "");
    } else {
        console.log("Scanning only new files.");
        new Notice("Scanning only new files.", 3000);
    }

    // 4. Get all pages that are in a folder starting with the folderPrefix
    //    and exclude files starting with "_"
    const pagesToConsider = dv.pages()
        .where(p => p.file &&
                     p.file.folder &&
                     p.file.folder.startsWith(folderPrefix) &&
                     !p.file.name.startsWith('_'));

    // 5. Filter out already scanned files (if not rescanning)
    const filesToScan = shouldRescan ? pagesToConsider : pagesToConsider.filter(p => !alreadyScanned.has(p.file.path));

    if (filesToScan.length === 0) {
        dv.paragraph(shouldRescan ? "No files found to scan in specified directories." : "No new files to scan.");
        // Still update already_scanned.md if we didn't rescan, in case it was empty or needs creation
        if (!shouldRescan) await writeAlreadyScannedFiles(alreadyScanned);
        return;
    }
    new Notice(`Found ${filesToScan.length} files to process.`, 3000);

    // 6. Create a map to store words for each letter
    //    Structure: wordMap = { "a": Set<"word1", "word2">, "b": Set<...>, ... }
    const wordMapByLetter = new Map();
    for (let i = 0; i < 26; i++) {
        wordMapByLetter.set(String.fromCharCode(97 + i), new Set());
    }

    // 7. Process each page
    let processedFileCount = 0;
    for (const page of filesToScan) {
        try {
            const abstractFile = app.vault.getAbstractFileByPath(page.file.path);
            if (abstractFile instanceof TFile) { // Check if it's a file
                const content = await app.vault.read(abstractFile);

                if (content) {
                    // Exclude content within YAML frontmatter
                    const contentWithoutFrontmatter = content.replace(/^---[\s\S]*?---\s*/m, "");

                    const wordsInContent = getWordsFromText(contentWithoutFrontmatter);

                    for (const word of wordsInContent) {
                        const firstLetter = word.charAt(0).toLowerCase();
                        if (wordMapByLetter.has(firstLetter)) {
                            wordMapByLetter.get(firstLetter).add(word);
                        }
                    }
                    alreadyScanned.add(page.file.path);
                    processedFileCount++;
                }
            } else {
                console.warn(`Skipping "${page.file.path}" as it's not a regular file or TFile is undefined.`);
            }
        } catch (e) {
            console.error(`Error processing file "${page.file.path}":`, e);
            new Notice(`Error processing file "${page.file.path}". Check console.`, 5000);
        }
    }

    // 8. Write words to separate files
    if (processedFileCount > 0 || shouldRescan) { // Only write if new words were processed or if rescanning (to clear files)
        for (const [letter, wordsSet] of wordMapByLetter) {
            // If not rescanning, we need to load existing words and merge
            if (!shouldRescan) {
                const existingLetterFilePath = `${outputFolder}/${letter}.md`;
                const existingAbstractFile = app.vault.getAbstractFileByPath(existingLetterFilePath);
                if (existingAbstractFile instanceof TFile) {
                    const existingContent = await app.vault.read(existingAbstractFile);
                    existingContent.split("\n").map(line => line.trim()).filter(line => line !== "").forEach(word => wordsSet.add(word));
                }
            }
            await writeWordsToLetterFile(letter, wordsSet);
        }
    }

    // 9. Update the list of already scanned files
    await writeAlreadyScannedFiles(alreadyScanned);

    dv.paragraph(`Word extraction complete. ${processedFileCount} files processed. Check the "${outputFolder}" folder for the results.`);
    new Notice("Word extraction complete!", 5000);
})();

Is there anybody who can help me? Thanks in advance
Emanuele