js-unigram: a character-unigram (1-gram) inverted index with phrase search in Node.js, by @kozakana. The gist bundles four files: an indexer script, document.js, search.js, and a small CLI entry point.
// Indexer: clears ./tokens, re-tokenizes every .txt under ./docs, and
// writes the docId -> {name, path} map to ./docs.json.
const Document = require('./document')
const fs = require('fs')

const TOKEN_DIR = './tokens'
const DOC_DIR = './docs'
const DOC_DATA = './docs.json'

const docFiles = fs.readdirSync(DOC_DIR).filter(fileName => fileName.match(/.+\.txt$/))
const docs = {}

// Delete stale posting files before re-indexing.
const tokenFiles = fs.readdirSync(TOKEN_DIR).filter(fileName => fileName.match(/.+\.txt$/))
const removeTokenPath = tokenFiles.map((fileName) => {
  return new Promise((resolve, reject) => {
    fs.unlink(`${TOKEN_DIR}/${fileName}`, (err) => {
      if (err) {
        reject(err)
      } else {
        resolve()
      }
    })
  })
})

Promise.all(removeTokenPath).then(() => {
  for (let i = 0; i < docFiles.length; i++) {
    docs[i] = {}
    docs[i].name = docFiles[i]
    docs[i].path = `${DOC_DIR}/${docFiles[i]}`
    const doc = new Document(docs[i].path, i)
    doc.saveTokens(TOKEN_DIR)
  }
  // Persist the docId -> document map so search results can name the files.
  fs.writeFileSync(DOC_DATA, JSON.stringify(docs), 'utf8')
})
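With two hypothetical source files a.txt and b.txt in ./docs, the docs.json written by the indexer would look like this (a docId-keyed map used later to resolve search hits back to files):

{"0":{"name":"a.txt","path":"./docs/a.txt"},"1":{"name":"b.txt","path":"./docs/b.txt"}}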
// document.js: reads a text file and appends its character positions
// to one posting file per distinct token.
const fs = require('fs')

module.exports = class {
  constructor(filePath, documentId, option = {}) { // option is currently unused
    this.text = fs.readFileSync(filePath, 'utf-8')
    this.documentId = `${documentId}`
  }

  // Split the text into unigrams (single characters; Array.from is Unicode-aware).
  tokenizer(text) {
    return Array.from(text)
  }

  // Normalize tokens; here, just lowercasing.
  filter(tokenizedText) {
    return tokenizedText.map((token) => {
      return token.toLowerCase()
    })
  }

  // Map each distinct token to the list of positions where it occurs.
  tokenClassification() {
    const tokenPosition = {}
    const tokenizedText = this.tokenizer(this.text)
    const tokens = this.filter(tokenizedText)
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i]
      if (tokenPosition[token]) {
        tokenPosition[token].push(i)
      } else {
        tokenPosition[token] = [i]
      }
    }
    return tokenPosition
  }

  // Append a line "docId:pos1,pos2,..." to <charCode>.txt for each token.
  saveTokens(outputFileDir) {
    const tokenPosition = this.tokenClassification()
    Object.keys(tokenPosition).forEach((token) => {
      const positionText = "\n" + this.documentId + ':' + tokenPosition[token].join(',')
      const outputFile = `${outputFileDir}/${token.charCodeAt(0)}.txt`
      fs.appendFileSync(outputFile, positionText)
    })
  }
}
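A quick sanity check of the class above, assuming a hypothetical file tmp.txt containing the text "Aba":

const Document = require('./document')
const doc = new Document('tmp.txt', 0)
console.log(doc.tokenClassification())
// => { a: [ 0, 2 ], b: [ 1 ] } ('A' is lowercased by filter(), so it merges with 'a')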
// search.js: looks up each query character's posting file and keeps the
// documents where the characters appear at consecutive positions.
const fs = require('fs')

// Parse one posting line of the form "docId:pos1,pos2,..." into { docId, positions }.
const parseTokenData = (line) => {
  let tmpStr = ''
  let docId
  const positions = []
  for (let i = 0; i < line.length; i++) {
    if (line[i] === ',') {
      positions.push(parseInt(tmpStr, 10))
      tmpStr = ''
    } else if (line[i] === ':') {
      docId = parseInt(tmpStr, 10)
      tmpStr = ''
    } else {
      tmpStr += line[i]
    }
  }
  positions.push(parseInt(tmpStr, 10))
  return {
    docId: docId,
    positions: positions
  }
}
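// For instance, a posting line written by document.js round-trips as:
//   parseTokenData('3:1,5,9')  // => { docId: 3, positions: [1, 5, 9] }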
// Read a posting file; a missing file just means the token occurs nowhere (null).
const getTokenData = (tokenFile) => {
  return new Promise((resolve, reject) => {
    fs.readFile(tokenFile, 'utf-8', (err, data) => {
      if (err) {
        if (err.code === 'ENOENT') {
          resolve(null)
        } else {
          reject(err)
        }
        return // already settled; don't fall through to resolve(data)
      }
      resolve(data)
    })
  })
}
// Phrase step: keep the next token's positions that immediately follow
// a position where the phrase has matched so far.
const getNextPositions = (currentArr, nextArr) => {
  const dupArr = []
  currentArr.forEach((cur) => {
    if (nextArr.indexOf(cur + 1) !== -1) {
      dupArr.push(cur + 1)
    }
  })
  return dupArr
}
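// Example: if the current token matched at positions 2 and 7 and the next
// token occurs at positions 3 and 10, only 3 extends the phrase:
//   getNextPositions([2, 7], [3, 10])  // => [3]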
// Kick off one posting-file read per query character. The keyword is
// lowercased to match the normalization applied at index time.
const getTokenList = (keyword, dir) => {
  const tokens = Array.from(keyword.toLowerCase())
  const tokenList = tokens.map((token) => {
    const tokenFile = `${dir}/${token.charCodeAt(0)}.txt`
    return getTokenData(tokenFile)
  })
  return tokenList
}
// Merge the posting data into { docId: [positions of char 1, positions of char 2, ...] }.
// Promise.all preserves query-character order, so the inner arrays stay aligned.
const tokenListParse = (tokenList) => {
  return Promise.all(tokenList).then((arr) => {
    const documentPositionList = {}
    arr.forEach((str) => {
      if (!str) return // this character appears in no document
      str.split("\n").forEach((line) => {
        if (line) {
          const {docId, positions} = parseTokenData(line)
          if (documentPositionList[docId]) {
            documentPositionList[docId].push(positions)
          } else {
            documentPositionList[docId] = [positions]
          }
        }
      })
    })
    return documentPositionList
  })
}
// For each document, count how many consecutive query characters chain together:
// start at 1 for the first character, then add 1 for every adjacent pair of
// tokens found at consecutive positions.
const connectTokens = (documentPositionList) => {
  const docIdConnectNum = {}
  Object.keys(documentPositionList).forEach((documentId) => {
    const tokenPositionList = documentPositionList[documentId]
    docIdConnectNum[documentId] = 1
    tokenPositionList.reduce((current, next) => {
      const nextPositions = getNextPositions(current, next)
      if (nextPositions.length > 0) docIdConnectNum[documentId]++
      return nextPositions
    })
  })
  return docIdConnectNum
}
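// Worked example (hypothetical data): for the query "ab" against document 0,
// where 'a' occurs at positions 4 and 9 and 'b' at position 5:
//   connectTokens({ 0: [[4, 9], [5]] })  // => { '0': 2 }
// 2 === Array.from('ab').length, so document 0 is a full phrase match.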
// Resolve with the ids of documents containing every query character
// consecutively, i.e. an exact (lowercased) phrase match.
module.exports = (keyword, dir) => {
  const tokenList = getTokenList(keyword, dir)
  return tokenListParse(tokenList).then((documentPositionList) => {
    const docIdConnectNum = connectTokens(documentPositionList)
    return Object.keys(docIdConnectNum).filter((docId) => {
      return docIdConnectNum[docId] === Array.from(keyword).length
    })
  })
}
// CLI entry point: prompt for a keyword, search the index, and print the hits.
const search = require('./search')
const fs = require('fs')
const readlineSync = require('readline-sync')

const TOKEN_DIR = './tokens'
const DOC_DATA = './docs.json'

const keyword = readlineSync.question('> ')
search(keyword, TOKEN_DIR).then((matchedDocIds) => {
  const docData = JSON.parse(fs.readFileSync(DOC_DATA, 'utf8'))
  matchedDocIds.forEach((docId) => {
    console.log(docData[docId]) // prints { name, path } for each matching document
  })
}).catch((err) => {
  console.log(err)
})
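An example session (hypothetical data), after the indexer has been run and assuming a.txt contains the query phrase:

> ab
{ name: 'a.txt', path: './docs/a.txt' }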