Keras text tokenizer in JavaScript with minimal functionality
// Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
class Tokenizer {
  constructor(config = {}) {
    // Characters stripped from input text before splitting (Keras-style default filters).
    this.filters = config.filters || /[\\.,/#!$%^&*;:{}=\-_`~()]/g;
    // Lowercase input by default, matching Keras' Tokenizer.
    this.lower = typeof config.lower === 'undefined' ? true : config.lower;
    // Primary indexing maps: word to index and index to word.
    this.wordIndex = {};
    this.indexWord = {};
    // Keeping track of word counts across all fitted texts.
    this.wordCounts = {};
  }

  // Lowercase (if enabled), strip filtered characters, collapse whitespace,
  // and split the text into an array of words.
  cleanText(text) {
    if (this.lower) text = text.toLowerCase();
    return text
      .replace(this.filters, '')
      .replace(/\s{2,}/g, ' ')
      .split(' ');
  }

  // Build the vocabulary from an array of texts. Words are indexed by
  // descending frequency, starting at 1 (0 is reserved for unknown words).
  fitOnTexts(texts) {
    texts.forEach(text => {
      text = this.cleanText(text);
      text.forEach(word => {
        this.wordCounts[word] = (this.wordCounts[word] || 0) + 1;
      });
    });
    Object.entries(this.wordCounts)
      .sort((a, b) => b[1] - a[1])
      .forEach(([word, number], i) => {
        this.wordIndex[word] = i + 1;
        this.indexWord[i + 1] = word;
      });
  }

  // Convert an array of texts into arrays of word indices; unknown words map to 0.
  textsToSequences(texts) {
    return texts.map(text => this.cleanText(text).map(word => this.wordIndex[word] || 0));
  }

  // Serialize the fitted vocabulary so it can be restored with tokenizerFromJson.
  toJson() {
    return JSON.stringify({
      wordIndex: this.wordIndex,
      indexWord: this.indexWord,
      wordCounts: this.wordCounts
    });
  }
}

// Recreate a Tokenizer from the JSON produced by toJson().
export const tokenizerFromJson = json_string => {
  const tokenizer = new Tokenizer();
  const js = JSON.parse(json_string);
  tokenizer.wordIndex = js.wordIndex;
  tokenizer.indexWord = js.indexWord;
  tokenizer.wordCounts = js.wordCounts;
  return tokenizer;
};

export default Tokenizer;
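Below is a short usage sketch (illustrative, not part of the gist's files) showing how the Tokenizer above can be used: fit a vocabulary, convert texts to integer sequences, and round-trip the vocabulary through JSON. The sample strings and logged values are examples only; note that tokenizerFromJson restores the vocabulary but not the filters/lower options, which fall back to their defaults.

// Usage sketch for the Tokenizer above (example corpus and output are illustrative).
import Tokenizer, { tokenizerFromJson } from './tokenizer';

const tokenizer = new Tokenizer();
tokenizer.fitOnTexts(['hello world', 'hello again']);

// Words are indexed by descending frequency starting at 1; unknown words map to 0.
console.log(tokenizer.wordIndex); // { hello: 1, world: 2, again: 3 }
console.log(tokenizer.textsToSequences(['hello world', 'goodbye world'])); // [ [ 1, 2 ], [ 0, 2 ] ]

// Persist the fitted vocabulary (e.g. alongside a saved model) and restore it later.
const restored = tokenizerFromJson(tokenizer.toJson());
console.log(restored.textsToSequences(['hello again'])); // [ [ 1, 3 ] ]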
// Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
import Tokenizer, { tokenizerFromJson } from './tokenizer';

describe('Tokenizer', () => {
  it('should load from JSON', () => {
    const tokenizer = new Tokenizer();
    tokenizer.wordIndex = {
      hello: 1,
      world: 2,
    };
    tokenizer.indexWord = {
      1: 'hello',
      2: 'world'
    };
    const recreated_tokenizer = tokenizerFromJson(tokenizer.toJson());
    expect(recreated_tokenizer.wordIndex).toEqual(tokenizer.wordIndex);
    expect(recreated_tokenizer.indexWord).toEqual(tokenizer.indexWord);
  });

  it('should respect the lower flag', () => {
    const texts = ['hello hello Hello'];
    // Test the default behaviour (lowercasing enabled)
    let tokenizer = new Tokenizer();
    tokenizer.fitOnTexts(texts);
    expect(tokenizer.wordIndex).toEqual({ hello: 1 });
    // Test with lowercasing disabled
    tokenizer = new Tokenizer({ lower: false });
    tokenizer.fitOnTexts(texts);
    expect(tokenizer.wordIndex).toEqual({ hello: 1, Hello: 2 });
  });

  it('should tokenize texts and store metadata for the texts', () => {
    const tokenizer = new Tokenizer();
    const texts = [
      'hello hello .,/#!$%^&*;:{}= \\ -_`~() hello Hello world world world',
      'great success .,/#!$%^&*;:{}=\\-_`~() Success'
    ];
    tokenizer.fitOnTexts(texts);
    const sequences = tokenizer.textsToSequences(texts);
    expect(tokenizer.wordIndex).toEqual({
      hello: 1,
      world: 2,
      success: 3,
      great: 4
    });
    expect(tokenizer.indexWord).toEqual({
      1: 'hello',
      2: 'world',
      3: 'success',
      4: 'great'
    });
    expect(tokenizer.wordCounts).toEqual({
      hello: 4,
      world: 3,
      success: 2,
      great: 1
    });
    expect(sequences).toEqual([
      [1, 1, 1, 1, 2, 2, 2],
      [4, 3, 3]
    ]);
  });
});