Skip to content

Instantly share code, notes, and snippets.

@yoavst
Created April 5, 2017 09:05
Show Gist options
  • Save yoavst/6f2bfb1d35d5395f8682ba62255e181a to your computer and use it in GitHub Desktop.
Save yoavst/6f2bfb1d35d5395f8682ba62255e181a to your computer and use it in GitHub Desktop.
Parse a psychometric exam PDF file
package com.yoavst.psychometric
import com.yoavst.resourceOf
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import org.apache.pdfbox.text.TextPosition
import java.io.File
/**
 * Entry point: loads `psychometric/exam.pdf` and extracts the index, the answer key,
 * the chapters and the grade table from it.
 *
 * The document is closed when parsing finishes, including when one of the steps throws.
 *
 * @param args command-line arguments (unused).
 * @throws NoSuchElementException if the index has no entry named [AnswersPage] or [GradePage].
 */
fun main(args: Array<String>) {
    val file = File("psychometric/exam.pdf")
    PDDocument.load(file).use { document ->
        val indexes = getIndexes(document)
        // NOTE(review): the "+ 1" presumably compensates for an offset between the page numbers
        // printed in the index and the physical page numbers of the PDF - confirm.
        val answers = getAnswers(document, indexes.first { (name, _) -> name == AnswersPage }.number + 1)
        // The parsed chapters and grades are not consumed yet; they are kept as the parsing result.
        val chapters = createChapters(indexes, answers)
        val grades = getGrades(document, indexes.first { (name, _) -> name == GradePage }.number + 1)
    }
}
/**
 * Reads the answer key from a single page of [document].
 *
 * Every text chunk containing [ChapterLine] is recorded as a chapter heading, and every other
 * chunk containing [AnswersLine] is recorded as an answers line (with the [AnswersLine] marker
 * removed and the rest trimmed). Each answers line is then attributed to the heading with the
 * largest y coordinate that is still smaller than the line's own y coordinate.
 *
 * @param document the opened exam PDF.
 * @param page the page number (as understood by [PDFTextStripper]) holding the answer key.
 * @return one [Answers] per answers line, in the order the lines were encountered.
 * @throws NoSuchElementException if an answers line has no chapter heading with a smaller y.
 */
fun getAnswers(document: PDDocument, page: Int): List<Answers> {
    val chapters = mutableListOf<Pair<String, Int>>() // heading text to y
    val answers = mutableListOf<Pair<String, Int>>() // answer characters to y
    val stripper = object : PDFTextStripper() {
        override fun writeString(text: String, textPositions: List<TextPosition>) {
            // The position is read only inside a matching branch, so chunks that are
            // neither a heading nor an answers line never touch textPositions.
            when {
                ChapterLine in text -> chapters += text to textPositions[0].y.toInt()
                AnswersLine in text -> answers += text.replace(AnswersLine, "").trim() to textPositions[0].y.toInt()
            }
        }
    }
    stripper.startPage = page
    stripper.endPage = page
    stripper.getText(document)

    // Sort once (largest y first) instead of re-sorting the headings for every answers line.
    val chaptersByDescendingY = chapters.sortedByDescending { it.second }
    return answers.map { (data, y) ->
        val chapterName = chaptersByDescendingY.firstOrNull { (_, chapterY) -> chapterY < y }?.first
            ?: throw NoSuchElementException("No chapter heading found before the answers line at y=$y on page $page")
        // Each character is converted with `char - '0'`, so the line is expected to hold only digits.
        Answers(chapterName, IntArray(data.length) { data[it] - '0' })
    }
}
/**
 * Parses the index found on page 2 of [document].
 *
 * Only lines containing a run of at least four dots (the dot leaders between a title and its
 * page number) are considered. The dots are replaced by spaces, the first run of digits on the
 * line becomes the page number, and whatever remains (trimmed) becomes the entry name.
 * Lines without any digits are skipped.
 *
 * Lines are split with [lines], which accepts CRLF, LF and CR, so the result does not depend on
 * the line separator the stripper emits on the current platform.
 *
 * @param document the opened exam PDF.
 * @return the index entries in the order they appear on the page.
 */
fun getIndexes(document: PDDocument): List<Page> {
    val stripper = PDFTextStripper()
    stripper.startPage = 2
    stripper.endPage = 2
    val text = stripper.getText(document)

    // Compiled once instead of once per line.
    val dotRun = "[.]{2,}".toRegex()
    val digitRun = "\\d+".toRegex()

    return text.lines()
        .filter { "...." in it }
        .map { it.replace(dotRun, ".").replace('.', ' ').trim() }
        .mapNotNull { entry ->
            digitRun.find(entry)?.let { match ->
                Page(entry.removeRange(match.range).trim(), match.value.toInt())
            }
        }
}
/**
 * Combines the index entries with the answer key into [Chapter]s.
 *
 * For every [Answers] entry, its name (with `" - "` replaced by a single space) is looked up
 * among the index entries sorted by page number. The chapter spans from that entry's page up to,
 * but not including, the page of the next index entry.
 *
 * @param pages the index entries, in any order.
 * @param answers the answer key, one entry per chapter.
 * @return one [Chapter] per element of [answers], in the same order.
 * @throws IndexOutOfBoundsException if a chapter is missing from the index, or is the last index
 *   entry so that its end page cannot be determined.
 */
fun createChapters(pages: List<Page>, answers: List<Answers>): List<Chapter> {
    val sortedPages = pages.sortedBy(Page::number)
    return answers.map { (name, chapterAnswers) ->
        val mappedName = name.replace(" - ", " ")
        val pageIndex = sortedPages.indexOfFirst { it.name == mappedName }
        if (pageIndex < 0) {
            throw IndexOutOfBoundsException("Chapter \"$mappedName\" is not listed in the index")
        }
        // The end of a chapter is only known from the start of the following index entry.
        val nextPage = sortedPages.getOrNull(pageIndex + 1)
            ?: throw IndexOutOfBoundsException("Chapter \"$mappedName\" is the last index entry; its end page is unknown")
        Chapter(name, sortedPages[pageIndex].number until nextPage.number, chapterAnswers)
    }
}
/**
 * Parses the grade table from a single page of [document].
 *
 * Every purely numeric text chunk on the page (except the one equal to `page - 1`) is treated as
 * one table row: a leading group of glyphs, detected by their x positions, is the row key, and
 * the remaining digits are split into up to three numbers that become the [Grade] fields.
 *
 * @param document the opened exam PDF.
 * @param page the page number (as understood by [PDFTextStripper]) holding the grade table.
 * @return 47 grades, where the element at index `i` belongs to the row whose key is `i`.
 * @throws IllegalArgumentException (from `requireNoNulls`) if any of the 47 rows was not found.
 */
fun getGrades(document: PDDocument, page: Int): List<Grade> {
// One slot per row key 0..46; the key is presumably the number of right answers - TODO confirm.
val grades = arrayOfNulls<Grade>(47)
val stripper = object : PDFTextStripper() {
override fun writeString(text: String, textPositions: List<TextPosition>) {
// Only chunks that parse as a number are rows. The chunk equal to `page - 1` is skipped;
// NOTE(review): presumably that is the page number printed on the page - confirm.
if (text.toLongOrNull()?.takeIf { it != (page - 1).toLong() } != null) {
// Collect the leading glyphs that sit close together: each glyph is accepted while its x is
// less than 10 units away from the previously accepted glyph's x. The first larger gap ends the key.
var pos = textPositions[0].x
var rightAnswers: String = textPositions[0].unicode
var i = 1
while (i < textPositions.size) {
val position = textPositions[i]
if (Math.abs(position.x - pos) < 10) {
rightAnswers += position.unicode
i++
pos = position.x
} else break
}
// Everything after the key is the concatenated digits of the row's values.
val leftText = text.substring(rightAnswers.length)
val numbers = IntArray(3)
var numbersIndex = 0
var number = 0
// Greedy split: keep appending digits to the current number while it stays <= 150; a digit
// that would push it above 150 closes the current number and starts the next one.
// NOTE(review): 150 is presumably the maximum value of a single column - confirm.
// NOTE(review): more than three resulting numbers would index past `numbers` (size 3).
for (k in leftText.indices) {
val temp = number * 10 + (leftText[k] - '0')
if (temp > 150) {
numbers[numbersIndex++] = number
number = (leftText[k] - '0')
} else {
number = temp
}
}
// Store the number that was still being accumulated when the digits ran out.
numbers[numbersIndex] = number
// A value of 0 (including columns that were never filled) is stored as null.
grades[rightAnswers.toInt()] = Grade(numbers[0].takeIf { it != 0 }, numbers[1].takeIf { it != 0 }, numbers[2].takeIf { it != 0 })
}
}
}
stripper.startPage = page
stripper.endPage = page
// The returned text is ignored; the call is made for the writeString callbacks above.
stripper.getText(document)
// Fails if any of the 47 slots was left unfilled.
return grades.requireNoNulls().toList()
}
/**
 * One entry of the exam's index.
 *
 * @property name the entry title, with the dot leaders and the page number removed.
 * @property number the page number listed for the entry.
 */
data class Page(
    val name: String,
    val number: Int
)
/**
 * The answer key of one chapter.
 *
 * Because [answers] is an array, `equals`, `hashCode` and `toString` are overridden to work on
 * the array's contents rather than on its identity.
 *
 * @property name the chapter heading the answers belong to.
 * @property answers the answer values, one per question, in order.
 */
data class Answers(val name: String, val answers: IntArray) {
    override fun equals(other: Any?): Boolean =
        other is Answers && other.name == name && other.answers.contentEquals(answers)

    override fun hashCode(): Int = name.hashCode() * 31 + answers.contentHashCode()

    // The generated toString would print the array as "[I@<hash>"; print its contents instead.
    override fun toString(): String = "Answers(name=$name, answers=${answers.contentToString()})"
}
/**
 * A chapter of the exam: its name, the pages it spans and its answer key.
 *
 * Because [answers] is an array, `equals`, `hashCode` and `toString` are overridden to work on
 * the array's contents rather than on its identity.
 *
 * @property name the chapter heading.
 * @property pages the page numbers covered by the chapter.
 * @property answers the answer values, one per question, in order.
 */
data class Chapter(val name: String, val pages: IntRange, val answers: IntArray) {
    override fun equals(other: Any?): Boolean =
        other is Chapter && other.name == name && other.pages == pages && other.answers.contentEquals(answers)

    override fun hashCode(): Int = (name.hashCode() * 31 + pages.hashCode()) * 31 + answers.contentHashCode()

    // The generated toString would print the array as "[I@<hash>"; print its contents instead.
    override fun toString(): String =
        "Chapter(name=$name, pages=$pages, answers=${answers.contentToString()})"
}
/**
 * One row of the grade table; a field is `null` when the row holds no value for it.
 *
 * @property hebrew the first value of the row.
 * @property math the second value of the row.
 * @property english the third value of the row.
 */
data class Grade(
    val hebrew: Int?,
    val math: Int?,
    val english: Int?
)
/** Title of the index entry for the answer-key page (Hebrew for "correct answers key"). */
val AnswersPage: String = "מפתח תשובות נכונות"

/** Title of the index entry for the grade-table page (Hebrew for "calculating the estimated exam scores"). */
val GradePage: String = "חישוב אומדן ציוני הבחינה"

/** Marker identifying a chapter heading on the answer-key page (Hebrew for "chapter"). */
val ChapterLine: String = "פרק"

/** Marker identifying an answers line on the answer-key page (Hebrew for "the correct [one]"). */
val AnswersLine: String = "הנכונה"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment