Bunpro Anki scraper
// Bunpro N1 Grammar Anki Card Generator
// This script extracts grammar points and example sentences from Bunpro N1 deck
// and formats them for Anki import
// Written by Claude Code. Don't judge me on the code quality.
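// Note: the script calls the global fetch API, so it assumes Node.js 18 or newer.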
const fs = require('fs');

class BunproAnkiGenerator {
  constructor() {
    this.baseUrl = 'https://bunpro.jp';
    this.deckUrl = 'https://bunpro.jp/decks/lzqrdc/Bunpro-N1-Grammar';
    this.allCards = [];
  }

  // Function to extract grammar point URLs from the main deck page
  async extractGrammarPointUrls(deckHtml) {
    console.log('Parsing HTML for grammar points...');

    // Method 1: Parse HTML structure for grammar point cards
    const grammarPoints = this.parseGrammarPointCardsFromHTML(deckHtml);
    if (grammarPoints.length > 0) {
      console.log(`Successfully extracted ${grammarPoints.length} grammar points from HTML`);
      return grammarPoints;
    }

    // Method 2: Fallback - look for any grammar_points links
    console.log('Primary parsing failed, trying fallback method...');
    const fallbackPoints = this.extractGrammarPointLinksRegex(deckHtml);
    if (fallbackPoints.length > 0) {
      console.log(`Fallback method found ${fallbackPoints.length} grammar points`);
      return fallbackPoints;
    }

    // Method 3: Last resort - use known N1 grammar points
    console.log('HTML parsing failed, using known N1 grammar points as fallback...');
    return this.getKnownN1GrammarPoints();
  }

  // Parse grammar point cards from the deck page's HTML card structure
  parseGrammarPointCardsFromHTML(html) {
    const grammarPoints = [];

    // Find all div elements with the grammar point card classes
    const cardRegex = /<div[^>]*js_decks-card_info[^>]*deck-info-card[^>]*>(.*?)<\/div>(?:\s*<\/div>)*/gs;
    const cardMatches = html.matchAll(cardRegex);

    for (const cardMatch of cardMatches) {
      const cardHTML = cardMatch[1];

      // Extract the href from the anchor tag
      const hrefRegex = /<a[^>]*href="([^"]*grammar_points[^"]*)"[^>]*>/;
      const hrefMatch = cardHTML.match(hrefRegex);

      // Extract the grammar point name from the title element
      const titleRegex = /<p[^>]*(?:deck-card-title|v-text_large--400)[^>]*>(.*?)<\/p>/;
      const titleMatch = cardHTML.match(titleRegex);

      // Extract the description from the span element
      const descRegex = /<span[^>]*u-text_body--400[^>]*u-text_fg-secondary[^>]*>(.*?)<\/span>/;
      const descMatch = cardHTML.match(descRegex);

      if (hrefMatch && titleMatch) {
        const href = hrefMatch[1];
        const name = titleMatch[1].trim();
        const description = descMatch ? descMatch[1].trim() : '';
        const fullUrl = href.startsWith('http') ? href : `https://bunpro.jp${href}`;

        grammarPoints.push({
          name: name,
          url: fullUrl,
          description: description
        });
      }
    }

    return grammarPoints;
  }
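
  // For orientation, this is roughly the markup shape the regexes above expect,
  // reconstructed from the class names they match (a sketch, not actual Bunpro markup):
  //
  //   <div class="js_decks-card_info deck-info-card">
  //     <a href="/grammar_points/に至るまで">
  //       <p class="deck-card-title v-text_large--400">に至るまで</p>
  //       <span class="u-text_body--400 u-text_fg-secondary">(Everything) from...to...</span>
  //     </a>
  //   </div>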

  // Fallback method: Extract any grammar_points links from HTML
  extractGrammarPointLinksRegex(html) {
    const grammarPoints = [];

    // Find all links to grammar_points
    const linkRegex = /<a[^>]*href="([^"]*\/grammar_points\/[^"]*)"[^>]*>(.*?)<\/a>/gs;
    const linkMatches = html.matchAll(linkRegex);

    for (const linkMatch of linkMatches) {
      const href = linkMatch[1];
      const linkHTML = linkMatch[0];

      // Try to extract title from various possible patterns
      const titlePatterns = [
        /<p[^>]*(?:deck-card-title|v-text_large--400)[^>]*>(.*?)<\/p>/,
        /<div[^>]*title[^>]*>(.*?)<\/div>/,
        />([\u3040-\u309f\u30a0-\u30ff\u4e00-\u9faf\s\+\(\)()・~ー]+)</ // Japanese characters
      ];

      let name = null;
      for (const pattern of titlePatterns) {
        const match = linkHTML.match(pattern);
        if (match) {
          name = match[1].trim();
          break;
        }
      }

      // Try to extract description
      const descPattern = /<span[^>]*u-text_body--400[^>]*>(.*?)<\/span>/;
      const descMatch = linkHTML.match(descPattern);
      const description = descMatch ? descMatch[1].trim() : '';

      if (name && href) {
        const fullUrl = href.startsWith('http') ? href : `https://bunpro.jp${href}`;
        grammarPoints.push({
          name: name,
          url: fullUrl,
          description: description
        });
      }
    }

    // Remove duplicates
    const uniquePoints = grammarPoints.filter((point, index, self) =>
      index === self.findIndex(p => p.url === point.url)
    );

    return uniquePoints;
  }

  // Last resort: Known N1 grammar points (minimal list for absolute fallback)
  getKnownN1GrammarPoints() {
    return [
      { name: 'に至るまで', url: 'https://bunpro.jp/grammar_points/%E3%81%AB%E8%87%B3%E3%82%8B%E3%81%BE%E3%81%A7', description: '(Everything) from...to...' },
      { name: 'ならまだしも', url: 'https://bunpro.jp/grammar_points/%E3%81%AA%E3%82%89%E3%81%BE%E3%81%A0%E3%81%97%E3%82%82', description: 'A is fine...but B is not' },
      { name: 'とあって', url: 'https://bunpro.jp/grammar_points/%E3%81%A8%E3%81%82%E3%81%A3%E3%81%A6', description: 'Since...(it is no wonder that)' }
    ];
  }

  // Function to parse individual grammar point pages using JSON data
  parseGrammarPointPage(htmlContent, grammarPoint, grammarUrl, grammarMeaning = '') {
    try {
      // First, try to extract data from __NEXT_DATA__ JSON
      const jsonCards = this.parseGrammarPointFromJSON(htmlContent, grammarPoint, grammarUrl, grammarMeaning);
      if (jsonCards.length > 0) {
        console.log(`Extracted ${jsonCards.length} cards from JSON data`);
        return jsonCards;
      }
    } catch (error) {
      console.log('JSON parsing failed, falling back to HTML parsing:', error.message);
    }

    // Fallback to original HTML parsing method
    return this.parseGrammarPointFromHTML(htmlContent, grammarPoint, grammarUrl, grammarMeaning);
  }

  // New method: Parse grammar point data from __NEXT_DATA__ JSON
  parseGrammarPointFromJSON(htmlContent, grammarPoint, grammarUrl, grammarMeaning) {
    const cards = [];

    // Extract JSON data from __NEXT_DATA__ script tag
    const jsonMatch = htmlContent.match(/<script id="__NEXT_DATA__" type="application\/json">(.*?)<\/script>/s);
    if (!jsonMatch) {
      throw new Error('No __NEXT_DATA__ found');
    }

    const jsonData = JSON.parse(jsonMatch[1]);
    const studyQuestions = jsonData?.props?.pageProps?.included?.studyQuestions || [];
    if (studyQuestions.length === 0) {
      throw new Error('No study questions found in JSON');
    }

    studyQuestions.forEach((question, index) => {
      try {
        // Skip readonly questions, focus on examples and cloze questions
        if (question.question_type === 'readonly' && question.used_in === 'writeups') {
          return;
        }

        const content = question.content || '';
        const translation = question.translation || '';
        const answer = question.answer || grammarPoint;

        if (content && translation) {
          // Clean up the Japanese sentence and remove furigana
          let japaneseSentence = content
            .replace(/<span[^>]*study-area-input[^>]*>.*?<\/span>/g, answer) // Replace input with answer
            .replace(/<span[^>]*name-highlight[^>]*>(.*?)<\/span>/g, '$1') // Remove name highlighting
            .replace(/<[^>]*>/g, '') // Remove all other HTML tags
            .replace(/\r?\n/g, '') // Remove newlines
            .trim();

          // Remove furigana from Japanese text
          japaneseSentence = this.removeFurigana(japaneseSentence);

          // Clean up the English translation
          let englishTranslation = translation
            .replace(/<span[^>]*name-highlight[^>]*>(.*?)<\/span>/g, '$1') // Remove name highlighting
            .replace(/<strong>/g, '**').replace(/<\/strong>/g, '**') // Convert strong tags temporarily
            .replace(/<[^>]*>/g, '') // Remove all HTML tags
            .replace(/\*\*/g, '') // Remove the temporary markers
            .replace(/\r?\n/g, ' ') // Replace newlines with spaces
            .trim();
          // Skip if either sentence is too short to be a real example
          if (japaneseSentence.length < 10 || englishTranslation.length < 10) {
            return;
          }

          // Highlight the grammar point in the Japanese sentence
          let frontText = japaneseSentence;
          if (answer && frontText.includes(answer)) {
            frontText = frontText.replace(
              new RegExp(answer.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'),
              `<strong>${answer}</strong>`
            );
          }

          // Create the back with English translation and grammar info
          const cleanGrammarPoint = this.removeFurigana(grammarPoint);
          const grammarInfo = `<a lang="ja" href="${grammarUrl}">${cleanGrammarPoint}</a>${grammarMeaning ? ', ' + grammarMeaning : ''}`;

          cards.push({
            front: `<p>${frontText}</p>`,
            back: `<p>${englishTranslation}</p>\n<p>${grammarInfo}</p>`
          });
        }
      } catch (error) {
        console.error(`Error processing question ${index}:`, error);
      }
    });

    return cards;
  }

  // Original HTML parsing method (fallback)
  parseGrammarPointFromHTML(htmlContent, grammarPoint, grammarUrl, grammarMeaning) {
    const cards = [];
    const lines = htmlContent.split('\n').map(line => line.trim()).filter(line => line);
    const sentencePairs = [];

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];

      // Detect Japanese sentences
      const japaneseCharCount = (line.match(/[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9faf]/g) || []).length;
      const totalCharCount = line.replace(/\s/g, '').length;
      const japaneseRatio = totalCharCount > 0 ? japaneseCharCount / totalCharCount : 0;

      // Filter for valid Japanese sentences
      if (japaneseRatio > 0.3 &&
          line.length > 15 &&
          line.length < 200 &&
          !this.isMetadataLine(line)) {
        let japaneseSentence = line.replace(/<[^>]*>/g, '').trim();

        // Remove furigana from Japanese text
        japaneseSentence = this.removeFurigana(japaneseSentence);

        // Must contain the grammar point and look like a real sentence
        if (japaneseSentence.includes(grammarPoint) &&
            (japaneseSentence.includes('。') || japaneseSentence.includes('、') || japaneseSentence.length > 25)) {
          // Look for English translation in next lines
          for (let j = i + 1; j < Math.min(i + 4, lines.length); j++) {
            const nextLine = lines[j];
            if (this.isEnglishTranslation(nextLine)) {
              let englishTranslation = nextLine.replace(/<[^>]*>/g, '').trim();
              if (englishTranslation && !englishTranslation.includes('get access to')) {
                sentencePairs.push({
                  japanese: japaneseSentence,
                  english: englishTranslation
                });
                break;
              }
            }
          }
        }
      }
    }

    // Create Anki cards from sentence pairs
    sentencePairs.forEach((pair, index) => {
      // Highlight the grammar point in the Japanese sentence
      let frontText = pair.japanese;
      if (grammarPoint && frontText.includes(grammarPoint)) {
        frontText = frontText.replace(
          new RegExp(grammarPoint.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'),
          `<strong>${grammarPoint}</strong>`
        );
      }

      // Create the back with English translation and grammar info
      const cleanGrammarPoint = this.removeFurigana(grammarPoint);
      const grammarInfo = `<a lang="ja" href="${grammarUrl}">${cleanGrammarPoint}</a>${grammarMeaning ? ', ' + grammarMeaning : ''}`;

      cards.push({
        front: `<p>${frontText}</p>`,
        back: `<p>${pair.english}</p>\n<p>${grammarInfo}</p>`
      });
    });

    return cards;
  }

  // Helper function to remove furigana from Japanese text
  removeFurigana(text) {
    // Remove various furigana patterns:
    // 如(ごと)く -> 如く
    // 1(いっ)週間 -> 1週間
    // A(エー) -> A
    // 母親(ははおや) -> 母親
    return text
      // Remove readings in full-width parentheses (most common)
      .replace(/([一-龯々a-zA-Z0-9]+)（[\u3040-\u309f\u30a0-\u30ff]+）/g, '$1')
      // Remove readings in half-width parentheses
      .replace(/([一-龯々a-zA-Z0-9]+)\([\u3040-\u309f\u30a0-\u30ff]+\)/g, '$1')
      // Handle any remaining patterns with numbers/letters
      .replace(/([0-9a-zA-Z]+)（[\u3040-\u309f\u30a0-\u30ff]+）/g, '$1')
      .replace(/([0-9a-zA-Z]+)\([\u3040-\u309f\u30a0-\u30ff]+\)/g, '$1');
  }

  // Helper function to identify metadata lines
  isMetadataLine(line) {
    const metadataKeywords = [
      'href', 'class', 'Structure', 'Details', 'Register', 'About',
      'Examples', '--:--', 'Premium', 'Self-Study', 'Online', 'Offline',
      'Grammar Discussion', 'Most Recent', 'Join', 'Bunpro tracks'
    ];
    return metadataKeywords.some(keyword => line.includes(keyword));
  }

  // Helper function to identify English translations
  isEnglishTranslation(line) {
    const englishCharCount = (line.match(/[a-zA-Z]/g) || []).length;
    const totalCharCount = line.replace(/\s/g, '').length;
    const englishRatio = totalCharCount > 0 ? englishCharCount / totalCharCount : 0;

    return englishRatio > 0.5 &&
           line.length > 10 &&
           line.length < 300 &&
           !line.includes('<') &&
           !line.includes('http') &&
           !this.isMetadataLine(line);
  }

  // Format cards for Anki import using proper quoting for multi-line cards
  formatCardsForAnki(cards) {
    return cards.map(card => {
      // Escape embedded double quotes by doubling them (CSV-style)
      const front = card.front.replace(/"/g, '""');
      const back = card.back.replace(/"/g, '""');
      // Put quotes around the front and back to allow multi-line content
      return `"${front}"\t"${back}"`;
    }).join('\n');
  }
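
  // Illustrative output record, shown only to demonstrate the quoting ("\t" marks the
  // tab separator; the sentence fragments are placeholders):
  //   "<p>...<strong>に至るまで</strong>...</p>"\t"<p>... up until ...</p>
  //   <p><a lang=""ja"" href=""https://bunpro.jp/grammar_points/..."">に至るまで</a></p>"
  // Because each back field contains a literal newline, a single record spans two
  // physical lines; Anki accepts this as long as the field stays wrapped in double quotes.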

  // Main function to process all grammar points
  async generateAllCards() {
    console.log('Starting Bunpro N1 Anki card generation...');

    try {
      // Step 1: Fetch the main deck page
      console.log('Fetching main deck page...');
      const deckResponse = await fetch(this.deckUrl);
      const deckHtml = await deckResponse.text();

      // Step 2: Extract all grammar point URLs using HTML parsing
      console.log('Extracting grammar points from HTML...');
      const grammarPoints = await this.extractGrammarPointUrls(deckHtml);
      if (grammarPoints.length === 0) {
        throw new Error('No grammar points found in deck page');
      }

      // Step 3: Process grammar points
      const pointsToProcess = grammarPoints;
      console.log(`Found ${grammarPoints.length} total grammar points`);
      console.log(`Processing all ${pointsToProcess.length} grammar points...`);
      console.log('Grammar points found:', grammarPoints.map(p => p.name).join(', '));

      for (const [index, grammarPoint] of pointsToProcess.entries()) {
        console.log(`\nProcessing ${index + 1}/${pointsToProcess.length}: ${grammarPoint.name}`);

        try {
          // Fetch the individual grammar point page
          const response = await fetch(grammarPoint.url);
          if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
          }
          const content = await response.text();

          // Parse and create cards
          const cards = this.parseGrammarPointPage(
            content,
            grammarPoint.name,
            grammarPoint.url,
            grammarPoint.description
          );

          this.allCards.push(...cards);
          console.log(`✓ Generated ${cards.length} cards for ${grammarPoint.name}`);

          // Add delay to be respectful to the server
          if (index < pointsToProcess.length - 1) {
            await new Promise(resolve => setTimeout(resolve, 1000));
          }
        } catch (error) {
          console.error(`✗ Failed to process ${grammarPoint.name}:`, error.message);

          // Add a sample card even if parsing fails
          const cleanGrammarPoint = this.removeFurigana(grammarPoint.name);
          this.allCards.push({
            front: `<p>${cleanGrammarPoint}</p>`,
            back: `<p>${grammarPoint.description}</p>\n<p><a lang="ja" href="${grammarPoint.url}">${cleanGrammarPoint}</a></p>`
          });
        }
      }
    } catch (error) {
      console.error('Error in main generation process:', error.message);

      // Fallback to sample data if web scraping fails
      console.log('Falling back to sample data...');
      const sampleCards = this.parseGrammarPointPage(
        this.getSampleContent(),
        'に至るまで',
        'https://bunpro.jp/grammar_points/%E3%81%AB%E8%87%B3%E3%82%8B%E3%81%BE%E3%81%A7',
        '(Everything) from...to..., From…up until..., Starting with...ending with...'
      );
      this.allCards.push(...sampleCards);
    }

    return this.allCards;
  }

  // Sample content for fallback
  getSampleContent() {
    return `Examples
にしまさんはバーテンダーから大企業の社長に至るまでいろいろな仕事を歴任した。
Nishima-san held various positions, from a bartender to a president of a large company.
今日では、CPUはパソコンから歯ブラシに至るまで、さまざまな装置に見られる。
Nowadays, CPUs can be found in various kinds of devices, from computers to toothbrushes, etc.
総理大臣から一般人に至るまであらゆる市民は法のもとで平等である。
All citizens are equal under the law, from the prime minister to an ordinary person.
航海の初めから事故そのものに至るまでの難破船の航海日誌が見つかった。
A shipwreck's logbook covering everything from the beginning of the voyage to the accident itself has been found.
アジアでは、箸は昔より現在に至るまで長きに渡って、食べるために使われている。
In Asia, chopsticks have been used to eat for a long time, from long ago to the present.`;
  }

  // Generate the final Anki import file
  async generateAnkiFile() {
    const cards = await this.generateAllCards();
    const ankiFormat = this.formatCardsForAnki(cards);

    // Write to file
    const filename = 'bunpro_n1_cards.txt';
    fs.writeFileSync(filename, ankiFormat);

    console.log('\n=== ANKI CARDS GENERATED ===');
    console.log(`✅ Successfully created ${filename} with ${cards.length} cards`);
    console.log('📁 File location: ' + process.cwd() + '/' + filename);
    console.log('\n📝 To import into Anki:');
    console.log('1. Open Anki and go to File > Import');
    console.log('2. Select the bunpro_n1_cards.txt file');
    console.log('3. Set field separator to "Tab"');
    console.log('4. Enable "Allow HTML in fields"');
    console.log('5. Map fields: Field 1 → Front, Field 2 → Back');
    console.log('6. Click Import');

    return ankiFormat;
  }
}

// Usage example
async function main() {
  const generator = new BunproAnkiGenerator();
  await generator.generateAnkiFile();

  console.log('\n⚙️ Configuration Notes:');
  console.log('- Processes every grammar point found on the deck page (slice pointsToProcess in generateAllCards to limit a run)');
  console.log('- Includes 1-second delays between requests to be respectful to the server');
  console.log('- Uses JSON data extraction for maximum accuracy');
  console.log('- Removes furigana from Japanese text for cleaner cards');
  console.log('- Supports multi-line cards with proper quoting');
  console.log('- Extracts grammar points dynamically; a short built-in list is used only as a last-resort fallback');
}
// 🎉 ENHANCED BUNPRO ANKI GENERATOR
// The script now includes advanced features:
// 1. Fetches the Bunpro N1 deck page and extracts ALL grammar points dynamically
// 2. Uses multiple HTML parsing methods with comprehensive fallbacks
// 3. For each grammar point page, extracts data from __NEXT_DATA__ JSON structure
// 4. Gets clean Japanese sentences and English translations from studyQuestions array
// 5. Properly handles cloze deletions and replaces placeholders with grammar points
// 6. Filters out writeup-only content and focuses on example sentences
// 7. Creates properly formatted Anki cards with highlighted grammar points
// 8. Removes furigana from Japanese text for cleaner, more readable cards
// 9. Supports multi-line cards with proper CSV quoting (no <br> tags needed)
// 10. Writes output directly to bunpro_n1_cards.txt file using Node.js fs module
// 11. Includes comprehensive error handling with HTML fallback parsing
// 12. Extracts everything dynamically from the live website, keeping only a small built-in sample (a three-item grammar list and a few example sentences) as a last-resort fallback
// Key Features:
// - Dynamic Grammar Point Discovery: Finds ALL points on the page automatically
// - JSON Data Extraction: Uses __NEXT_DATA__ for clean, structured sentence data
// - Furigana Removal: Converts 母親(ははおや) -> 母親 for cleaner cards
// - Multi-line Support: Uses proper CSV quoting instead of <br> tags
// - File Output: Automatically creates bunpro_n1_cards.txt ready for Anki import
// - Error Handling: Multiple fallback methods ensure reliable operation
// JSON Data Structure Parsed:
// - props.pageProps.included.studyQuestions[] contains example sentences
// - Each question has: content (Japanese), translation (English), answer (grammar point)
// - Automatically replaces <span class='study-area-input'>____</span> with correct answer
// - Filters by question_type and used_in to get the best example sentences
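// - Illustrative shape of one studyQuestions entry (field names are the ones read in
//   parseGrammarPointFromJSON; the values shown here are invented for illustration):
//     {
//       "question_type": "fill_in",   // entries are skipped only when this is 'readonly' AND used_in is 'writeups'
//       "used_in": "reviews",
//       "content": "CPUはパソコンから歯ブラシ<span class='study-area-input'>____</span>、さまざまな装置に見られる。",
//       "translation": "CPUs can be found in various devices, from computers <strong>to</strong> toothbrushes.",
//       "answer": "に至るまで"
//     }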
// Processing notes:
// 1. The script automatically finds and processes every grammar point on the page by default
// 2. To limit a test run, slice pointsToProcess in generateAllCards() (see the sketch below)
// 3. Adjust the delay between requests if needed (currently 1 second)
// 4. Each grammar point typically yields 5-10 high-quality example sentence cards
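// A minimal way to cap a test run (a hypothetical tweak; the code above processes everything):
// in generateAllCards(), replace
//   const pointsToProcess = grammarPoints;
// with, for example,
//   const pointsToProcess = grammarPoints.slice(0, 10);
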
main();