js-unigram
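A toy character-level (unigram) full-text index in Node.js: an indexer splits each text file under ./docs into single-character tokens and writes per-character posting files under ./tokens, and a search script reports the documents in which a keyword's characters occur as a contiguous sequence.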
const Document = require('./document')
const fs = require('fs')

const TOKEN_DIR = './tokens'
const DOC_DIR = './docs'
const DOC_DATA = './docs.json'

const docFiles = fs.readdirSync(DOC_DIR).filter(fileName => fileName.match(/.+\.txt$/))
const docs = {}

// Delete any token files left over from a previous indexing run.
const tokenFiles = fs.readdirSync(TOKEN_DIR).filter(fileName => fileName.match(/.+\.txt$/))
const removeTokenPath = tokenFiles.map((fileName) => {
  return new Promise((resolve, reject) => {
    fs.unlink(`${TOKEN_DIR}/${fileName}`, (err) => {
      if (err) {
        reject(err)
      } else {
        resolve()
      }
    })
  })
})

// Once the old index is cleared, tokenize every document and record
// the id -> { name, path } mapping in docs.json.
Promise.all(removeTokenPath).then(() => {
  for (let i = 0; i < docFiles.length; i++) {
    docs[i] = {}
    docs[i].name = docFiles[i]
    docs[i].path = `${DOC_DIR}/${docFiles[i]}`
    const doc = new Document(docs[i].path, i)
    doc.saveTokens(TOKEN_DIR)
  }
  fs.writeFileSync(DOC_DATA, JSON.stringify(docs), 'utf8')
})
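Each posting line the indexer writes has the form <docId>:<pos1>,<pos2>,..., appended to a file named after the character's code point. For example, a document with id 0 whose text is "abca" adds the line 0:0,3 to ./tokens/97.txt (the file for 'a') and 0:1 to ./tokens/98.txt; positions are 0-based character offsets, lower-cased by the filter step.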
const fs = require('fs')

// One indexed document: reads the file, splits it into single-character
// (unigram) tokens, and appends posting lines to per-character token files.
module.exports = class {
  constructor(filePath, documentId, option = {}) {
    this.text = fs.readFileSync(filePath, 'utf-8')
    this.documentId = `${documentId}`
  }

  // One token per Unicode code point.
  tokenizer(text) {
    return Array.from(text)
  }

  // Normalization: lower-case every token.
  filter(tokenizedText) {
    return tokenizedText.map((token) => {
      return token.toLowerCase()
    })
  }

  // Map each token to the list of positions at which it occurs.
  tokenClassification() {
    const tokenPosition = {}
    const tokenizedText = this.tokenizer(this.text)
    const tokens = this.filter(tokenizedText)
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i]
      if (tokenPosition[token]) {
        tokenPosition[token].push(i)
      } else {
        tokenPosition[token] = [i]
      }
    }
    return tokenPosition
  }

  // Append "<docId>:<pos1>,<pos2>,..." to "<charCode>.txt" for each token.
  saveTokens(outputFileDir) {
    const tokenPosition = this.tokenClassification()
    Object.keys(tokenPosition).forEach((token) => {
      const positionText = "\n" + this.documentId + ':' + tokenPosition[token].join(',')
      const outputFile = `${outputFileDir}/${token.charCodeAt(0)}.txt`
      fs.appendFileSync(outputFile, positionText)
    })
  }
}
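A minimal usage sketch of the class, assuming it is saved as document.js (the name the indexer's require('./document') implies), that ./docs/sample.txt exists, and that ./tokens is a writable directory:

const Document = require('./document')

const doc = new Document('./docs/sample.txt', 0)
// For the text "abca" this yields { a: [0, 3], b: [1], c: [2] }
console.log(doc.tokenClassification())
// Appends the posting lines for document 0 under ./tokens
doc.saveTokens('./tokens')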
const fs = require('fs')

// Parse one posting line, "<docId>:<pos1>,<pos2>,...", into { docId, positions }.
const parseTokenData = (line) => {
  let tmpStr = ''
  let docId
  const positions = []
  for (let i = 0; i < line.length; i++) {
    if (line[i] === ',') {
      positions.push(parseInt(tmpStr, 10))
      tmpStr = ''
    } else if (line[i] === ':') {
      docId = parseInt(tmpStr, 10)
      tmpStr = ''
    } else {
      tmpStr += line[i]
    }
  }
  positions.push(parseInt(tmpStr, 10))
  return {
    docId: docId,
    positions: positions
  }
}

// Read one token file. A missing file only means no document contains
// that character, so resolve with null rather than failing the search.
const getTokenData = (tokenFile) => {
  return new Promise((resolve, reject) => {
    fs.readFile(tokenFile, 'utf-8', (err, data) => {
      if (err) {
        if (err.code === 'ENOENT') {
          resolve(null)
        } else {
          reject(err)
        }
        return
      }
      resolve(data)
    })
  })
}

// Keep only the positions in nextArr that immediately follow a position
// in currentArr, i.e. where the two characters are adjacent in the text.
const getNextPositions = (currentArr, nextArr) => {
  const dupArr = []
  currentArr.forEach((cur) => {
    if (nextArr.indexOf(cur + 1) !== -1) {
      dupArr.push(cur + 1)
    }
  })
  return dupArr
}

// Start reading the token file for each character of the keyword.
const getTokenList = (keyword, dir) => {
  const tokens = Array.from(keyword)
  const tokenList = tokens.map((token) => {
    const tokenFile = `${dir}/${token.charCodeAt(0)}.txt`
    return getTokenData(tokenFile)
  })
  return tokenList
}

// Group posting lines by document id: one positions array per keyword
// character, kept in keyword order.
const tokenListParse = (tokenList) => {
  return new Promise((resolve, reject) => {
    Promise.all(tokenList).then((arr) => {
      const documentPositionList = {}
      arr.forEach((str) => {
        if (!str) return
        str.split("\n").forEach((line) => {
          if (line) {
            const { docId, positions } = parseTokenData(line)
            if (documentPositionList[docId]) {
              documentPositionList[docId].push(positions)
            } else {
              documentPositionList[docId] = [positions]
            }
          }
        })
      })
      resolve(documentPositionList)
    }).catch((err) => {
      reject(err)
    })
  })
}

// Count, per document, how many keyword characters can be chained through
// adjacent positions; a full chain means the keyword appears verbatim.
const connectTokens = (documentPositionList) => {
  const docIdConnectNum = {}
  Object.keys(documentPositionList).forEach((documentId) => {
    const tokenPositionList = documentPositionList[documentId]
    docIdConnectNum[documentId] = 1
    tokenPositionList.reduce((current, next) => {
      const nextPositions = getNextPositions(current, next)
      if (nextPositions.length > 0) docIdConnectNum[documentId]++
      return nextPositions
    })
  })
  return docIdConnectNum
}

// A document matches when every character of the keyword was chained.
module.exports = (keyword, dir) => {
  const tokenList = getTokenList(keyword, dir)
  return new Promise((resolve, reject) => {
    tokenListParse(tokenList).then((documentPositionList) => {
      const docIdConnectNum = connectTokens(documentPositionList)
      const matchedDocId = Object.keys(docIdConnectNum).filter((docId) => {
        const connectNum = docIdConnectNum[docId]
        return connectNum === Array.from(keyword).length
      })
      resolve(matchedDocId)
    }).catch((err) => {
      reject(err)
    })
  })
}
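The interactive script below is the normal entry point, but the module can also be called directly. A minimal sketch, assuming it is saved as search.js (the name the require('./search') below implies) and the index under ./tokens has already been built:

const search = require('./search')

search('abc', './tokens').then((docIds) => {
  // docIds is an array of matching document-id strings, e.g. ['0']
  console.log(docIds)
})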
const search = require('./search')
const fs = require('fs')
const readlineSync = require('readline-sync')

const TOKEN_DIR = './tokens'
const DOC_DATA = './docs.json'

// Prompt for a keyword, search the token index, and print the metadata
// (name and path from docs.json) of every matching document.
const keyword = readlineSync.question('> ')
search(keyword, TOKEN_DIR).then((matchedDocIds) => {
  const docData = JSON.parse(fs.readFileSync(DOC_DATA, 'utf8'))
  matchedDocIds.forEach((docId) => {
    console.log(docData[docId])
  })
}).catch((err) => {
  console.log(err)
})