domenic · September 24, 2025 04:33
diff --git a/bunpro-anki-scraper.js b/bunpro-anki-scraper.js
 // Bunpro N1 Grammar Anki Card Generator
 // This script extracts grammar points and example sentences from Bunpro N1 deck
 // and formats them for Anki import

 // Written by Claude Code. Don't judge me on the code quality.

 const fs = require('fs');

 class BunproAnkiGenerator {
    constructor() {
        this.baseUrl = 'https://bunpro.jp';
        this.deckUrl = 'https://bunpro.jp/decks/lzqrdc/Bunpro-N1-Grammar';
        this.allCards = [];
    }

    // Function to extract grammar point URLs from the main deck page
    async extractGrammarPointUrls(deckHtml) {
        console.log('Parsing HTML for grammar points...');

        // Method 1: Parse HTML structure for grammar point cards
        const grammarPoints = this.parseGrammarPointCardsFromHTML(deckHtml);

        if (grammarPoints.length > 0) {
            console.log(`Successfully extracted ${grammarPoints.length} grammar points from HTML`);
            return grammarPoints;
        }

        // Method 2: Fallback - look for any grammar_points links
        console.log('Primary parsing failed, trying fallback method...');
        const fallbackPoints = this.extractGrammarPointLinksRegex(deckHtml);

        if (fallbackPoints.length > 0) {
            console.log(`Fallback method found ${fallbackPoints.length} grammar points`);
            return fallbackPoints;
        }

        // Method 3: Last resort - use known N1 grammar points
        console.log('HTML parsing failed, using known N1 grammar points as fallback...');
        return this.getKnownN1GrammarPoints();
    }

    // Parse grammar point cards from HTML using the structure you provided
    parseGrammarPointCardsFromHTML(html) {
        const grammarPoints = [];

        // Find all div elements with the grammar point card classes
        const cardRegex = /<div[^>]*js_decks-card_info[^>]*deck-info-card[^>]*>(.*?)<\/div>(?:\s*<\/div>)*/gs;
        const cardMatches = html.matchAll(cardRegex);

        for (const cardMatch of cardMatches) {
            const cardHTML = cardMatch[1];

            // Extract the href from the anchor tag
            const hrefRegex = /<a[^>]*href="([^"]*grammar_points[^"]*)"[^>]*>/;
            const hrefMatch = cardHTML.match(hrefRegex);

            // Extract the grammar point name from the title element
            const titleRegex = /<p[^>]*(?:deck-card-title|v-text_large--400)[^>]*>(.*?)<\/p>/;
            const titleMatch = cardHTML.match(titleRegex);

            // Extract the description from the span element
            const descRegex = /<span[^>]*u-text_body--400[^>]*u-text_fg-secondary[^>]*>(.*?)<\/span>/;
            const descMatch = cardHTML.match(descRegex);

            if (hrefMatch && titleMatch) {
                const href = hrefMatch[1];
                const name = titleMatch[1].trim();
                const description = descMatch ? descMatch[1].trim() : '';
                const fullUrl = href.startsWith('http') ? href : `https://bunpro.jp${href}`;

                grammarPoints.push({
                    name: name,
                    url: fullUrl,
                    description: description
                });
            }
        }

        return grammarPoints;
    }

    // Fallback method: Extract any grammar_points links from HTML
    extractGrammarPointLinksRegex(html) {
        const grammarPoints = [];

        // Find all links to grammar_points
        const linkRegex = /<a[^>]*href="([^"]*\/grammar_points\/[^"]*)"[^>]*>(.*?)<\/a>/gs;
        const linkMatches = html.matchAll(linkRegex);

        for (const linkMatch of linkMatches) {
            const href = linkMatch[1];
            const linkHTML = linkMatch[0];

            // Try to extract title from various possible patterns
            const titlePatterns = [
                /<p[^>]*(?:deck-card-title|v-text_large--400)[^>]*>(.*?)<\/p>/,
                /<div[^>]*title[^>]*>(.*?)<\/div>/,
                />([\u3040-\u309f\u30a0-\u30ff\u4e00-\u9faf\s\+\(\)（）・～ー]+)</  // Japanese characters
            ];

            let name = null;
            for (const pattern of titlePatterns) {
                const match = linkHTML.match(pattern);
                if (match) {
                    name = match[1].trim();
                    break;
                }
            }

            // Try to extract description
            const descPattern = /<span[^>]*u-text_body--400[^>]*>(.*?)<\/span>/;
            const descMatch = linkHTML.match(descPattern);
            const description = descMatch ? descMatch[1].trim() : '';

            if (name && href) {
                const fullUrl = href.startsWith('http') ? href : `https://bunpro.jp${href}`;

                grammarPoints.push({
                    name: name,
                    url: fullUrl,
                    description: description
                });
            }
        }

        // Remove duplicates
        const uniquePoints = grammarPoints.filter((point, index, self) =>
            index === self.findIndex(p => p.url === point.url)
        );

        return uniquePoints;
    }

    // Last resort: Known N1 grammar points (minimal list for absolute fallback)
    getKnownN1GrammarPoints() {
        return [
            { name: 'に至るまで', url: 'https://bunpro.jp/grammar_points/%E3%81%AB%E8%87%B3%E3%82%8B%E3%81%BE%E3%81%A7', description: '(Everything) from...to...' },
            { name: 'ならまだしも', url: 'https://bunpro.jp/grammar_points/%E3%81%AA%E3%82%89%E3%81%BE%E3%81%A0%E3%81%97%E3%82%82', description: 'A is fine...but B is not' },
            { name: 'とあって', url: 'https://bunpro.jp/grammar_points/%E3%81%A8%E3%81%82%E3%81%A3%E3%81%A6', description: 'Since...(it is no wonder that)' }
        ];
    }

    // Function to parse individual grammar point pages using JSON data
    parseGrammarPointPage(htmlContent, grammarPoint, grammarUrl, grammarMeaning = '') {
        const cards = [];

        try {
            // First, try to extract data from __NEXT_DATA__ JSON
            const jsonCards = this.parseGrammarPointFromJSON(htmlContent, grammarPoint, grammarUrl, grammarMeaning);
            if (jsonCards.length > 0) {
                console.log(`Extracted ${jsonCards.length} cards from JSON data`);
                return jsonCards;
            }
        } catch (error) {
            console.log('JSON parsing failed, falling back to HTML parsing:', error.message);
        }

        // Fallback to original HTML parsing method
        return this.parseGrammarPointFromHTML(htmlContent, grammarPoint, grammarUrl, grammarMeaning);
    }

    // New method: Parse grammar point data from __NEXT_DATA__ JSON
    parseGrammarPointFromJSON(htmlContent, grammarPoint, grammarUrl, grammarMeaning) {
        const cards = [];

        // Extract JSON data from __NEXT_DATA__ script tag
        const jsonMatch = htmlContent.match(/<script id="__NEXT_DATA__" type="application\/json">(.*?)<\/script>/s);
        if (!jsonMatch) {
            throw new Error('No __NEXT_DATA__ found');
        }

        const jsonData = JSON.parse(jsonMatch[1]);
        const studyQuestions = jsonData?.props?.pageProps?.included?.studyQuestions || [];

        if (studyQuestions.length === 0) {
            throw new Error('No study questions found in JSON');
        }

        studyQuestions.forEach((question, index) => {
            try {
                // Skip readonly questions, focus on examples and cloze questions
                if (question.question_type === 'readonly' && question.used_in === 'writeups') {
                    return;
                }

                const content = question.content || '';
                const translation = question.translation || '';
                const answer = question.answer || grammarPoint;

                if (content && translation) {
                    // Clean up the Japanese sentence and remove furigana
                    let japaneseSentence = content
                        .replace(/<span[^>]*study-area-input[^>]*>.*?<\/span>/g, answer) // Replace input with answer
                        .replace(/<span[^>]*name-highlight[^>]*>(.*?)<\/span>/g, '$1') // Remove name highlighting
                        .replace(/<[^>]*>/g, '') // Remove all other HTML tags
                        .replace(/\r?\n/g, '') // Remove newlines
                        .trim();

                    // Remove furigana from Japanese text
                    japaneseSentence = this.removeFurigana(japaneseSentence);

                    // Clean up the English translation
                    let englishTranslation = translation
                        .replace(/<span[^>]*name-highlight[^>]*>(.*?)<\/span>/g, '$1') // Remove name highlighting
                        .replace(/<strong>/g, '**').replace(/<\/strong>/g, '**') // Convert strong tags temporarily
                        .replace(/<[^>]*>/g, '') // Remove all HTML tags
                        .replace(/\*\*/g, '') // Remove the temporary markers
                        .replace(/\r?\n/g, ' ') // Replace newlines with spaces
                        .trim();

                    // Skip if either sentence is too short or doesn't contain the grammar point
                    if (japaneseSentence.length < 10 || englishTranslation.length < 10) {
                        return;
                    }

                    // Highlight the grammar point in the Japanese sentence
                    let frontText = japaneseSentence;
                    if (answer && frontText.includes(answer)) {
                        frontText = frontText.replace(
                            new RegExp(answer.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'),
                            `<strong>${answer}</strong>`
                        );
                    }

                    // Create the back with English translation and grammar info
                    const cleanGrammarPoint = this.removeFurigana(grammarPoint);
                    const grammarInfo = `<a lang="ja" href="${grammarUrl}">${cleanGrammarPoint}</a>${grammarMeaning ? ', ' + grammarMeaning : ''}`;

                    cards.push({
                        front: `<p>${frontText}</p>`,
                        back: `<p>${englishTranslation}</p>\n<p>${grammarInfo}</p>`
                    });
                }
            } catch (error) {
                console.error(`Error processing question ${index}:`, error);
            }
        });

        return cards;
    }

    // Original HTML parsing method (fallback)
    parseGrammarPointFromHTML(htmlContent, grammarPoint, grammarUrl, grammarMeaning) {
        const cards = [];
        const lines = htmlContent.split('\n').map(line => line.trim()).filter(line => line);
        const sentencePairs = [];

        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];

            // Detect Japanese sentences
            const japaneseCharCount = (line.match(/[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9faf]/g) || []).length;
            const totalCharCount = line.replace(/\s/g, '').length;
            const japaneseRatio = totalCharCount > 0 ? japaneseCharCount / totalCharCount : 0;

            // Filter for valid Japanese sentences
            if (japaneseRatio > 0.3 &&
                line.length > 15 &&
                line.length < 200 &&
                !this.isMetadataLine(line)) {

                let japaneseSentence = line.replace(/<[^>]*>/g, '').trim();

                // Remove furigana from Japanese text
                japaneseSentence = this.removeFurigana(japaneseSentence);

                // Must contain the grammar point and look like a real sentence
                if (japaneseSentence.includes(grammarPoint) &&
                    (japaneseSentence.includes('。') || japaneseSentence.includes('、') || japaneseSentence.length > 25)) {

                    // Look for English translation in next lines
                    for (let j = i + 1; j < Math.min(i + 4, lines.length); j++) {
                        const nextLine = lines[j];

                        if (this.isEnglishTranslation(nextLine)) {
                            let englishTranslation = nextLine.replace(/<[^>]*>/g, '').trim();

                            if (englishTranslation && !englishTranslation.includes('get access to')) {
                                sentencePairs.push({
                                    japanese: japaneseSentence,
                                    english: englishTranslation
                                });
                                break;
                            }
                        }
                    }
                }
            }
        }

        // Create Anki cards from sentence pairs
        sentencePairs.forEach((pair, index) => {
            // Highlight the grammar point in the Japanese sentence
            let frontText = pair.japanese;
            if (grammarPoint && frontText.includes(grammarPoint)) {
                frontText = frontText.replace(
                    new RegExp(grammarPoint.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'),
                    `<strong>${grammarPoint}</strong>`
                );
            }

            // Create the back with English translation and grammar info
            const cleanGrammarPoint = this.removeFurigana(grammarPoint);
            const grammarInfo = `<a lang="ja" href="${grammarUrl}">${cleanGrammarPoint}</a>${grammarMeaning ? ', ' + grammarMeaning : ''}`;

            cards.push({
                front: `<p>${frontText}</p>`,
                back: `<p>${pair.english}</p>\n<p>${grammarInfo}</p>`
            });
        });

        return cards;
    }

    // Helper function to remove furigana from Japanese text
    removeFurigana(text) {
        // Remove various furigana patterns:
        // 如（ごと）く -> 如く
        // １（いっ）週間 -> １週間
        // Ａ（エー） -> Ａ
        // 母親（ははおや） -> 母親
        return text
            // Remove readings in full-width parentheses (most common)
            .replace(/([一-龯々ａ-ｚＡ-Ｚ０-９]+)（[\u3040-\u309f\u30a0-\u30ff]+）/g, '$1')
            // Remove readings in half-width parentheses
            .replace(/([一-龯々ａ-ｚＡ-Ｚ０-９]+)\([\u3040-\u309f\u30a0-\u30ff]+\)/g, '$1')
            // Handle any remaining patterns with numbers/letters
            .replace(/([０-９ａ-ｚＡ-Ｚ]+)（[\u3040-\u309f\u30a0-\u30ff]+）/g, '$1')
            .replace(/([０-９ａ-ｚＡ-Ｚ]+)\([\u3040-\u309f\u30a0-\u30ff]+\)/g, '$1');
    }

    // Helper function to identify metadata lines
    isMetadataLine(line) {
        const metadataKeywords = [
            'href', 'class', 'Structure', 'Details', 'Register', 'About',
            'Examples', '--:--', 'Premium', 'Self-Study', 'Online', 'Offline',
            'Grammar Discussion', 'Most Recent', 'Join', 'Bunpro tracks'
        ];
        return metadataKeywords.some(keyword => line.includes(keyword));
    }

    // Helper function to identify English translations
    isEnglishTranslation(line) {
        const englishCharCount = (line.match(/[a-zA-Z]/g) || []).length;
        const totalCharCount = line.replace(/\s/g, '').length;
        const englishRatio = totalCharCount > 0 ? englishCharCount / totalCharCount : 0;

        return englishRatio > 0.5 &&
               line.length > 10 &&
               line.length < 300 &&
               !line.includes('<') &&
               !line.includes('http') &&
               !this.isMetadataLine(line);
    }

    // Format cards for Anki import using proper quoting for multi-line cards
    formatCardsForAnki(cards) {
        return cards.map(card => {
            // Transform all quotes to double quotes and escape existing double quotes
            const front = card.front.replace(/"/g, '""');
            const back = card.back.replace(/"/g, '""');

            // Put quotes around the front and back to allow multi-line content
            return `"${front}"\t"${back}"`;
        }).join('\n');
    }

    // Main function to process all grammar points
    async generateAllCards() {
        console.log('Starting Bunpro N1 Anki card generation...');

        try {
            // Step 1: Fetch the main deck page
            console.log('Fetching main deck page...');
            const deckResponse = await fetch(this.deckUrl);
            const deckHtml = await deckResponse.text();

            // Step 2: Extract all grammar point URLs using HTML parsing
            console.log('Extracting grammar points from HTML...');
            const grammarPoints = await this.extractGrammarPointUrls(deckHtml);

            if (grammarPoints.length === 0) {
                throw new Error('No grammar points found in deck page');
            }

            // Step 3: Process grammar points
            const pointsToProcess = grammarPoints;

            console.log(`Found ${grammarPoints.length} total grammar points`);
            console.log(`Processing first ${pointsToProcess.length} grammar points...`);
            console.log('Grammar points found:', grammarPoints.map(p => p.name).join(', '));

            for (const [index, grammarPoint] of pointsToProcess.entries()) {
                console.log(`\nProcessing ${index + 1}/${pointsToProcess.length}: ${grammarPoint.name}`);

                try {
                    // Fetch the individual grammar point page
                    const response = await fetch(grammarPoint.url);
                    if (!response.ok) {
                        throw new Error(`HTTP ${response.status}`);
                    }

                    const content = await response.text();

                    // Parse and create cards
                    const cards = this.parseGrammarPointPage(
                        content,
                        grammarPoint.name,
                        grammarPoint.url,
                        grammarPoint.description
                    );

                    this.allCards.push(...cards);
                    console.log(`✓ Generated ${cards.length} cards for ${grammarPoint.name}`);

                    // Add delay to be respectful to the server
                    if (index < pointsToProcess.length - 1) {
                        await new Promise(resolve => setTimeout(resolve, 1000));
                    }

                } catch (error) {
                    console.error(`✗ Failed to process ${grammarPoint.name}:`, error.message);

                    // Add a sample card even if parsing fails
                    const cleanGrammarPoint = this.removeFurigana(grammarPoint.name);
                    this.allCards.push({
                        front: `<p>${cleanGrammarPoint}</p>`,
                        back: `<p>${grammarPoint.description}</p>\n<p><a lang="ja" href="${grammarPoint.url}">${cleanGrammarPoint}</a></p>`
                    });
                }
            }

        } catch (error) {
            console.error('Error in main generation process:', error.message);

            // Fallback to sample data if web scraping fails
            console.log('Falling back to sample data...');
            const sampleCards = this.parseGrammarPointPage(
                this.getSampleContent(),
                'に至るまで',
                'https://bunpro.jp/grammar_points/%E3%81%AB%E8%87%B3%E3%82%8B%E3%81%BE%E3%81%A7',
                '(Everything) from...to..., From…up until..., Starting with...ending with...'
            );
            this.allCards.push(...sampleCards);
        }

        return this.allCards;
    }

    // Sample content for fallback
    getSampleContent() {
        return `Examples
 にしまさんはバーテンダーから大企業の社長に至るまでいろいろな仕事を歴任した。
 Nishima-san held various positions, from a bartender to a president of a large company.
 今日では、ＣＰＵはパソコンから歯ブラシに至るまで、さまざまな装置に見られる。
 Nowadays, CPUs can be found in various kinds of devices, from computers to toothbrushes, etc.
 総理大臣から一般人に至るまであらゆる市民は法のもとで平等である。
 All citizens are equal under the law, from the prime minister to an ordinary person.
 航海の初めから事故そのものに至るまでの難破船の航海日誌が見つかった。
 A shipwreck's logbook covering everything from the beginning of the voyage to the accident itself has been found.
 アジアでは、箸は昔より現在に至るまで長きに渡って、食べるために使われている。
 In Asia, chopsticks have been used to eat for a long time, from long ago to the present.`;
    }

    // Generate the final Anki import file
    async generateAnkiFile() {
        const cards = await this.generateAllCards();
        const ankiFormat = this.formatCardsForAnki(cards);

        // Write to file
        const filename = 'bunpro_n1_cards.txt';
        fs.writeFileSync(filename, ankiFormat);

        console.log('\n=== ANKI CARDS GENERATED ===');
        console.log(`✅ Successfully created ${filename} with ${cards.length} cards`);
        console.log('📁 File location: ' + process.cwd() + '/' + filename);
        console.log('\n📝 To import into Anki:');
        console.log('1. Open Anki and go to File > Import');
        console.log('2. Select the bunpro_n1_cards.txt file');
        console.log('3. Set field separator to "Tab"');
        console.log('4. Enable "Allow HTML in fields"');
        console.log('5. Map fields: Field 1 → Front, Field 2 → Back');
        console.log('6. Click Import');

        return ankiFormat;
    }
 }

 // Usage example
 async function main() {
    const generator = new BunproAnkiGenerator();
    await generator.generateAnkiFile();

    console.log('\n⚙️ Configuration Notes:');
    console.log('- Currently processes first 10 grammar points (configurable in generateAllCards)');
    console.log('- Change maxPointsToProcess variable to process more/all grammar points');
    console.log('- Includes 1-second delays between requests to be respectful to server');
    console.log('- Uses JSON data extraction for maximum accuracy');
    console.log('- Removes furigana from Japanese text for cleaner cards');
    console.log('- Supports multi-line cards with proper quoting');
    console.log('- No hard-coded grammar points - extracts everything dynamically');
 }

 // 🎉 ENHANCED BUNPRO ANKI GENERATOR
 // The script now includes advanced features:
 // 1. Fetches the Bunpro N1 deck page and extracts ALL grammar points dynamically
 // 2. Uses multiple HTML parsing methods with comprehensive fallbacks
 // 3. For each grammar point page, extracts data from __NEXT_DATA__ JSON structure
 // 4. Gets clean Japanese sentences and English translations from studyQuestions array
 // 5. Properly handles cloze deletions and replaces placeholders with grammar points
 // 6. Filters out writeup-only content and focuses on example sentences
 // 7. Creates properly formatted Anki cards with highlighted grammar points
 // 8. Removes furigana from Japanese text for cleaner, more readable cards
 // 9. Supports multi-line cards with proper CSV quoting (no <br> tags needed)
 // 10. Writes output directly to bunpro_n1_cards.txt file using Node.js fs module
 // 11. Includes comprehensive error handling with HTML fallback parsing
 // 12. No hard-coded content - everything is extracted dynamically from live website

 // Key Features:
 // - Dynamic Grammar Point Discovery: Finds ALL points on the page automatically
 // - JSON Data Extraction: Uses __NEXT_DATA__ for clean, structured sentence data
 // - Furigana Removal: Converts 母親（ははおや）-> 母親 for cleaner cards
 // - Multi-line Support: Uses proper CSV quoting instead of <br> tags
 // - File Output: Automatically creates bunpro_n1_cards.txt ready for Anki import
 // - Error Handling: Multiple fallback methods ensure reliable operation

 // JSON Data Structure Parsed:
 // - props.pageProps.included.studyQuestions[] contains example sentences
 // - Each question has: content (Japanese), translation (English), answer (grammar point)
 // - Automatically replaces <span class='study-area-input'>____</span> with correct answer
 // - Filters by question_type and used_in to get the best example sentences

 // To process ALL grammar points found on the page:
 // 1. Increase maxPointsToProcess in generateAllCards() or remove the limit entirely
 // 2. Adjust delay between requests if needed (currently 1 second)
 // 3. The script will automatically find and process every grammar point on the page
 // 4. Each grammar point typically yields 5-10 high-quality example sentence cards

 main();
	// Bunpro N1 Grammar Anki Card Generator
	// This script extracts grammar points and example sentences from Bunpro N1 deck
	// and formats them for Anki import

	// Written by Claude Code. Don't judge me on the code quality.

	const fs = require('fs');

	class BunproAnkiGenerator {
	constructor() {
	this.baseUrl = 'https://bunpro.jp';
	this.deckUrl = 'https://bunpro.jp/decks/lzqrdc/Bunpro-N1-Grammar';
	this.allCards = [];
	}

	// Function to extract grammar point URLs from the main deck page
	async extractGrammarPointUrls(deckHtml) {
	console.log('Parsing HTML for grammar points...');

	// Method 1: Parse HTML structure for grammar point cards
	const grammarPoints = this.parseGrammarPointCardsFromHTML(deckHtml);

	if (grammarPoints.length > 0) {
	console.log(`Successfully extracted ${grammarPoints.length} grammar points from HTML`);
	return grammarPoints;
	}

	// Method 2: Fallback - look for any grammar_points links
	console.log('Primary parsing failed, trying fallback method...');
	const fallbackPoints = this.extractGrammarPointLinksRegex(deckHtml);

	if (fallbackPoints.length > 0) {
	console.log(`Fallback method found ${fallbackPoints.length} grammar points`);
	return fallbackPoints;
	}

	// Method 3: Last resort - use known N1 grammar points
	console.log('HTML parsing failed, using known N1 grammar points as fallback...');
	return this.getKnownN1GrammarPoints();
	}

	// Parse grammar point cards from HTML using the structure you provided
	parseGrammarPointCardsFromHTML(html) {
	const grammarPoints = [];

	// Find all div elements with the grammar point card classes
	const cardRegex = /<div[^>]js_decks-card_info[^>]deck-info-card[^>]>(.?)<\/div>(?:\s<\/div>)/gs;
	const cardMatches = html.matchAll(cardRegex);

	for (const cardMatch of cardMatches) {
	const cardHTML = cardMatch[1];

	// Extract the href from the anchor tag
	const hrefRegex = /<a[^>]href="([^"]grammar_points[^"])"[^>]>/;
	const hrefMatch = cardHTML.match(hrefRegex);

	// Extract the grammar point name from the title element
	const titleRegex = /<p[^>](?:deck-card-title\|v-text_large--400)[^>]>(.*?)<\/p>/;
	const titleMatch = cardHTML.match(titleRegex);

	// Extract the description from the span element
	const descRegex = /<span[^>]u-text_body--400[^>]u-text_fg-secondary[^>]>(.?)<\/span>/;
	const descMatch = cardHTML.match(descRegex);

	if (hrefMatch && titleMatch) {
	const href = hrefMatch[1];
	const name = titleMatch[1].trim();
	const description = descMatch ? descMatch[1].trim() : '';
	const fullUrl = href.startsWith('http') ? href : `https://bunpro.jp${href}`;

	grammarPoints.push({
	name: name,
	url: fullUrl,
	description: description
	});
	}
	}

	return grammarPoints;
	}

	// Fallback method: Extract any grammar_points links from HTML
	extractGrammarPointLinksRegex(html) {
	const grammarPoints = [];

	// Find all links to grammar_points
	const linkRegex = /<a[^>]href="([^"]\/grammar_points\/[^"])"[^>]>(.*?)<\/a>/gs;
	const linkMatches = html.matchAll(linkRegex);

	for (const linkMatch of linkMatches) {
	const href = linkMatch[1];
	const linkHTML = linkMatch[0];

	// Try to extract title from various possible patterns
	const titlePatterns = [
	/<p[^>](?:deck-card-title\|v-text_large--400)[^>]>(.*?)<\/p>/,
	/<div[^>]title[^>]>(.*?)<\/div>/,
	/>([\u3040-\u309f\u30a0-\u30ff\u4e00-\u9faf\s\+\(\)（）・～ー]+)</ // Japanese characters
	];

	let name = null;
	for (const pattern of titlePatterns) {
	const match = linkHTML.match(pattern);
	if (match) {
	name = match[1].trim();
	break;
	}
	}

	// Try to extract description
	const descPattern = /<span[^>]u-text_body--400[^>]>(.*?)<\/span>/;
	const descMatch = linkHTML.match(descPattern);
	const description = descMatch ? descMatch[1].trim() : '';

	if (name && href) {
	const fullUrl = href.startsWith('http') ? href : `https://bunpro.jp${href}`;

	grammarPoints.push({
	name: name,
	url: fullUrl,
	description: description
	});
	}
	}

	// Remove duplicates
	const uniquePoints = grammarPoints.filter((point, index, self) =>
	index === self.findIndex(p => p.url === point.url)
	);

	return uniquePoints;
	}

	// Last resort: Known N1 grammar points (minimal list for absolute fallback)
	getKnownN1GrammarPoints() {
	return [
	{ name: 'に至るまで', url: 'https://bunpro.jp/grammar_points/%E3%81%AB%E8%87%B3%E3%82%8B%E3%81%BE%E3%81%A7', description: '(Everything) from...to...' },
	{ name: 'ならまだしも', url: 'https://bunpro.jp/grammar_points/%E3%81%AA%E3%82%89%E3%81%BE%E3%81%A0%E3%81%97%E3%82%82', description: 'A is fine...but B is not' },
	{ name: 'とあって', url: 'https://bunpro.jp/grammar_points/%E3%81%A8%E3%81%82%E3%81%A3%E3%81%A6', description: 'Since...(it is no wonder that)' }
	];
	}

	// Function to parse individual grammar point pages using JSON data
	parseGrammarPointPage(htmlContent, grammarPoint, grammarUrl, grammarMeaning = '') {
	const cards = [];

	try {
	// First, try to extract data from __NEXT_DATA__ JSON
	const jsonCards = this.parseGrammarPointFromJSON(htmlContent, grammarPoint, grammarUrl, grammarMeaning);
	if (jsonCards.length > 0) {
	console.log(`Extracted ${jsonCards.length} cards from JSON data`);
	return jsonCards;
	}
	} catch (error) {
	console.log('JSON parsing failed, falling back to HTML parsing:', error.message);
	}

	// Fallback to original HTML parsing method
	return this.parseGrammarPointFromHTML(htmlContent, grammarPoint, grammarUrl, grammarMeaning);
	}

	// New method: Parse grammar point data from __NEXT_DATA__ JSON
	parseGrammarPointFromJSON(htmlContent, grammarPoint, grammarUrl, grammarMeaning) {
	const cards = [];

	// Extract JSON data from __NEXT_DATA__ script tag
	const jsonMatch = htmlContent.match(/<script id="__NEXT_DATA__" type="application\/json">(.*?)<\/script>/s);
	if (!jsonMatch) {
	throw new Error('No __NEXT_DATA__ found');
	}

	const jsonData = JSON.parse(jsonMatch[1]);
	const studyQuestions = jsonData?.props?.pageProps?.included?.studyQuestions \|\| [];

	if (studyQuestions.length === 0) {
	throw new Error('No study questions found in JSON');
	}

	studyQuestions.forEach((question, index) => {
	try {
	// Skip readonly questions, focus on examples and cloze questions
	if (question.question_type === 'readonly' && question.used_in === 'writeups') {
	return;
	}

	const content = question.content \|\| '';
	const translation = question.translation \|\| '';
	const answer = question.answer \|\| grammarPoint;

	if (content && translation) {
	// Clean up the Japanese sentence and remove furigana
	let japaneseSentence = content
	.replace(/<span[^>]study-area-input[^>]>.*?<\/span>/g, answer) // Replace input with answer
	.replace(/<span[^>]name-highlight[^>]>(.*?)<\/span>/g, '$1') // Remove name highlighting
	.replace(/<[^>]*>/g, '') // Remove all other HTML tags
	.replace(/\r?\n/g, '') // Remove newlines
	.trim();

	// Remove furigana from Japanese text
	japaneseSentence = this.removeFurigana(japaneseSentence);

	// Clean up the English translation
	let englishTranslation = translation
	.replace(/<span[^>]name-highlight[^>]>(.*?)<\/span>/g, '$1') // Remove name highlighting
	.replace(/<strong>/g, '').replace(/<\/strong>/g, '') // Convert strong tags temporarily
	.replace(/<[^>]*>/g, '') // Remove all HTML tags
	.replace(/\\/g, '') // Remove the temporary markers
	.replace(/\r?\n/g, ' ') // Replace newlines with spaces
	.trim();

	// Skip if either sentence is too short or doesn't contain the grammar point
	if (japaneseSentence.length < 10 \|\| englishTranslation.length < 10) {
	return;
	}

	// Highlight the grammar point in the Japanese sentence
	let frontText = japaneseSentence;
	if (answer && frontText.includes(answer)) {
	frontText = frontText.replace(
	new RegExp(answer.replace(/[.*+?^${}()\|[\]\\]/g, '\\$&'), 'g'),
	`<strong>${answer}</strong>`
	);
	}

	// Create the back with English translation and grammar info
	const cleanGrammarPoint = this.removeFurigana(grammarPoint);
	const grammarInfo = `<a lang="ja" href="${grammarUrl}">${cleanGrammarPoint}</a>${grammarMeaning ? ', ' + grammarMeaning : ''}`;

	cards.push({
	front: `<p>${frontText}</p>`,
	back: `<p>${englishTranslation}</p>\n<p>${grammarInfo}</p>`
	});
	}
	} catch (error) {
	console.error(`Error processing question ${index}:`, error);
	}
	});

	return cards;
	}

	// Original HTML parsing method (fallback)
	parseGrammarPointFromHTML(htmlContent, grammarPoint, grammarUrl, grammarMeaning) {
	const cards = [];
	const lines = htmlContent.split('\n').map(line => line.trim()).filter(line => line);
	const sentencePairs = [];

	for (let i = 0; i < lines.length; i++) {
	const line = lines[i];

	// Detect Japanese sentences
	const japaneseCharCount = (line.match(/[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9faf]/g) \|\| []).length;
	const totalCharCount = line.replace(/\s/g, '').length;
	const japaneseRatio = totalCharCount > 0 ? japaneseCharCount / totalCharCount : 0;

	// Filter for valid Japanese sentences
	if (japaneseRatio > 0.3 &&
	line.length > 15 &&
	line.length < 200 &&
	!this.isMetadataLine(line)) {

	let japaneseSentence = line.replace(/<[^>]*>/g, '').trim();

	// Remove furigana from Japanese text
	japaneseSentence = this.removeFurigana(japaneseSentence);

	// Must contain the grammar point and look like a real sentence
	if (japaneseSentence.includes(grammarPoint) &&
	(japaneseSentence.includes('。') \|\| japaneseSentence.includes('、') \|\| japaneseSentence.length > 25)) {

	// Look for English translation in next lines
	for (let j = i + 1; j < Math.min(i + 4, lines.length); j++) {
	const nextLine = lines[j];

	if (this.isEnglishTranslation(nextLine)) {
	let englishTranslation = nextLine.replace(/<[^>]*>/g, '').trim();

	if (englishTranslation && !englishTranslation.includes('get access to')) {
	sentencePairs.push({
	japanese: japaneseSentence,
	english: englishTranslation
	});
	break;
	}
	}
	}
	}
	}
	}

	// Create Anki cards from sentence pairs
	sentencePairs.forEach((pair, index) => {
	// Highlight the grammar point in the Japanese sentence
	let frontText = pair.japanese;
	if (grammarPoint && frontText.includes(grammarPoint)) {
	frontText = frontText.replace(
	new RegExp(grammarPoint.replace(/[.*+?^${}()\|[\]\\]/g, '\\$&'), 'g'),
	`<strong>${grammarPoint}</strong>`
	);
	}

	// Create the back with English translation and grammar info
	const cleanGrammarPoint = this.removeFurigana(grammarPoint);
	const grammarInfo = `<a lang="ja" href="${grammarUrl}">${cleanGrammarPoint}</a>${grammarMeaning ? ', ' + grammarMeaning : ''}`;

	cards.push({
	front: `<p>${frontText}</p>`,
	back: `<p>${pair.english}</p>\n<p>${grammarInfo}</p>`
	});
	});

	return cards;
	}

	// Helper function to remove furigana from Japanese text
	removeFurigana(text) {
	// Remove various furigana patterns:
	// 如（ごと）く -> 如く
	// １（いっ）週間 -> １週間
	// Ａ（エー） -> Ａ
	// 母親（ははおや） -> 母親
	return text
	// Remove readings in full-width parentheses (most common)
	.replace(/([一-龯々ａ-ｚＡ-Ｚ０-９]+)（[\u3040-\u309f\u30a0-\u30ff]+）/g, '$1')
	// Remove readings in half-width parentheses
	.replace(/([一-龯々ａ-ｚＡ-Ｚ０-９]+)\([\u3040-\u309f\u30a0-\u30ff]+\)/g, '$1')
	// Handle any remaining patterns with numbers/letters
	.replace(/([０-９ａ-ｚＡ-Ｚ]+)（[\u3040-\u309f\u30a0-\u30ff]+）/g, '$1')
	.replace(/([０-９ａ-ｚＡ-Ｚ]+)\([\u3040-\u309f\u30a0-\u30ff]+\)/g, '$1');
	}

	// Helper function to identify metadata lines
	isMetadataLine(line) {
	const metadataKeywords = [
	'href', 'class', 'Structure', 'Details', 'Register', 'About',
	'Examples', '--:--', 'Premium', 'Self-Study', 'Online', 'Offline',
	'Grammar Discussion', 'Most Recent', 'Join', 'Bunpro tracks'
	];
	return metadataKeywords.some(keyword => line.includes(keyword));
	}

	// Helper function to identify English translations
	isEnglishTranslation(line) {
	const englishCharCount = (line.match(/[a-zA-Z]/g) \|\| []).length;
	const totalCharCount = line.replace(/\s/g, '').length;
	const englishRatio = totalCharCount > 0 ? englishCharCount / totalCharCount : 0;

	return englishRatio > 0.5 &&
	line.length > 10 &&
	line.length < 300 &&
	!line.includes('<') &&
	!line.includes('http') &&
	!this.isMetadataLine(line);
	}

	// Format cards for Anki import using proper quoting for multi-line cards
	formatCardsForAnki(cards) {
	return cards.map(card => {
	// Transform all quotes to double quotes and escape existing double quotes
	const front = card.front.replace(/"/g, '""');
	const back = card.back.replace(/"/g, '""');

	// Put quotes around the front and back to allow multi-line content
	return `"${front}"\t"${back}"`;
	}).join('\n');
	}

	// Main function to process all grammar points
	async generateAllCards() {
	console.log('Starting Bunpro N1 Anki card generation...');

	try {
	// Step 1: Fetch the main deck page
	console.log('Fetching main deck page...');
	const deckResponse = await fetch(this.deckUrl);
	const deckHtml = await deckResponse.text();

	// Step 2: Extract all grammar point URLs using HTML parsing
	console.log('Extracting grammar points from HTML...');
	const grammarPoints = await this.extractGrammarPointUrls(deckHtml);

	if (grammarPoints.length === 0) {
	throw new Error('No grammar points found in deck page');
	}

	// Step 3: Process grammar points
	const pointsToProcess = grammarPoints;

	console.log(`Found ${grammarPoints.length} total grammar points`);
	console.log(`Processing first ${pointsToProcess.length} grammar points...`);
	console.log('Grammar points found:', grammarPoints.map(p => p.name).join(', '));

	for (const [index, grammarPoint] of pointsToProcess.entries()) {
	console.log(`\nProcessing ${index + 1}/${pointsToProcess.length}: ${grammarPoint.name}`);

	try {
	// Fetch the individual grammar point page
	const response = await fetch(grammarPoint.url);
	if (!response.ok) {
	throw new Error(`HTTP ${response.status}`);
	}

	const content = await response.text();

	// Parse and create cards
	const cards = this.parseGrammarPointPage(
	content,
	grammarPoint.name,
	grammarPoint.url,
	grammarPoint.description
	);

	this.allCards.push(...cards);
	console.log(`✓ Generated ${cards.length} cards for ${grammarPoint.name}`);

	// Add delay to be respectful to the server
	if (index < pointsToProcess.length - 1) {
	await new Promise(resolve => setTimeout(resolve, 1000));
	}

	} catch (error) {
	console.error(`✗ Failed to process ${grammarPoint.name}:`, error.message);

	// Add a sample card even if parsing fails
	const cleanGrammarPoint = this.removeFurigana(grammarPoint.name);
	this.allCards.push({
	front: `<p>${cleanGrammarPoint}</p>`,
	back: `<p>${grammarPoint.description}</p>\n<p><a lang="ja" href="${grammarPoint.url}">${cleanGrammarPoint}</a></p>`
	});
	}
	}

	} catch (error) {
	console.error('Error in main generation process:', error.message);

	// Fallback to sample data if web scraping fails
	console.log('Falling back to sample data...');
	const sampleCards = this.parseGrammarPointPage(
	this.getSampleContent(),
	'に至るまで',
	'https://bunpro.jp/grammar_points/%E3%81%AB%E8%87%B3%E3%82%8B%E3%81%BE%E3%81%A7',
	'(Everything) from...to..., From…up until..., Starting with...ending with...'
	);
	this.allCards.push(...sampleCards);
	}

	return this.allCards;
	}

	// Sample content for fallback
	getSampleContent() {
	return `Examples
	にしまさんはバーテンダーから大企業の社長に至るまでいろいろな仕事を歴任した。
	Nishima-san held various positions, from a bartender to a president of a large company.
	今日では、ＣＰＵはパソコンから歯ブラシに至るまで、さまざまな装置に見られる。
	Nowadays, CPUs can be found in various kinds of devices, from computers to toothbrushes, etc.
	総理大臣から一般人に至るまであらゆる市民は法のもとで平等である。
	All citizens are equal under the law, from the prime minister to an ordinary person.
	航海の初めから事故そのものに至るまでの難破船の航海日誌が見つかった。
	A shipwreck's logbook covering everything from the beginning of the voyage to the accident itself has been found.
	アジアでは、箸は昔より現在に至るまで長きに渡って、食べるために使われている。
	In Asia, chopsticks have been used to eat for a long time, from long ago to the present.`;
	}

	// Generate the final Anki import file
	async generateAnkiFile() {
	const cards = await this.generateAllCards();
	const ankiFormat = this.formatCardsForAnki(cards);

	// Write to file
	const filename = 'bunpro_n1_cards.txt';
	fs.writeFileSync(filename, ankiFormat);

	console.log('\n=== ANKI CARDS GENERATED ===');
	console.log(`✅ Successfully created ${filename} with ${cards.length} cards`);
	console.log('📁 File location: ' + process.cwd() + '/' + filename);
	console.log('\n📝 To import into Anki:');
	console.log('1. Open Anki and go to File > Import');
	console.log('2. Select the bunpro_n1_cards.txt file');
	console.log('3. Set field separator to "Tab"');
	console.log('4. Enable "Allow HTML in fields"');
	console.log('5. Map fields: Field 1 → Front, Field 2 → Back');
	console.log('6. Click Import');

	return ankiFormat;
	}
	}

	// Usage example
	async function main() {
	const generator = new BunproAnkiGenerator();
	await generator.generateAnkiFile();

	console.log('\n⚙️ Configuration Notes:');
	console.log('- Currently processes first 10 grammar points (configurable in generateAllCards)');
	console.log('- Change maxPointsToProcess variable to process more/all grammar points');
	console.log('- Includes 1-second delays between requests to be respectful to server');
	console.log('- Uses JSON data extraction for maximum accuracy');
	console.log('- Removes furigana from Japanese text for cleaner cards');
	console.log('- Supports multi-line cards with proper quoting');
	console.log('- No hard-coded grammar points - extracts everything dynamically');
	}

	// 🎉 ENHANCED BUNPRO ANKI GENERATOR
	// The script now includes advanced features:
	// 1. Fetches the Bunpro N1 deck page and extracts ALL grammar points dynamically
	// 2. Uses multiple HTML parsing methods with comprehensive fallbacks
	// 3. For each grammar point page, extracts data from __NEXT_DATA__ JSON structure
	// 4. Gets clean Japanese sentences and English translations from studyQuestions array
	// 5. Properly handles cloze deletions and replaces placeholders with grammar points
	// 6. Filters out writeup-only content and focuses on example sentences
	// 7. Creates properly formatted Anki cards with highlighted grammar points
	// 8. Removes furigana from Japanese text for cleaner, more readable cards
	// 9. Supports multi-line cards with proper CSV quoting (no <br> tags needed)
	// 10. Writes output directly to bunpro_n1_cards.txt file using Node.js fs module
	// 11. Includes comprehensive error handling with HTML fallback parsing
	// 12. No hard-coded content - everything is extracted dynamically from live website

	// Key Features:
	// - Dynamic Grammar Point Discovery: Finds ALL points on the page automatically
	// - JSON Data Extraction: Uses __NEXT_DATA__ for clean, structured sentence data
	// - Furigana Removal: Converts 母親（ははおや）-> 母親 for cleaner cards
	// - Multi-line Support: Uses proper CSV quoting instead of <br> tags
	// - File Output: Automatically creates bunpro_n1_cards.txt ready for Anki import
	// - Error Handling: Multiple fallback methods ensure reliable operation

	// JSON Data Structure Parsed:
	// - props.pageProps.included.studyQuestions[] contains example sentences
	// - Each question has: content (Japanese), translation (English), answer (grammar point)
	// - Automatically replaces <span class='study-area-input'>____</span> with correct answer
	// - Filters by question_type and used_in to get the best example sentences

	// To process ALL grammar points found on the page:
	// 1. Increase maxPointsToProcess in generateAllCards() or remove the limit entirely
	// 2. Adjust delay between requests if needed (currently 1 second)
	// 3. The script will automatically find and process every grammar point on the page
	// 4. Each grammar point typically yields 5-10 high-quality example sentence cards

	main();