Created
April 5, 2017 09:05
-
-
Save yoavst/6f2bfb1d35d5395f8682ba62255e181a to your computer and use it in GitHub Desktop.
Parses a psychometric exam PDF file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.yoavst.psychometric | |
import com.yoavst.resourceOf
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import org.apache.pdfbox.text.TextPosition
import java.io.File
/**
 * Entry point: loads `psychometric/exam.pdf` and runs every parsing step on it.
 *
 * The index is read first; the pages holding the answer key and the grade table are then
 * located by looking up the index entries named [AnswersPage] and [GradePage].
 *
 * @param args command-line arguments (unused).
 * @throws NoSuchElementException if the index has no entry named [AnswersPage] or [GradePage].
 */
fun main(args: Array<String>) {
    val file = File("psychometric/exam.pdf")
    // PDDocument keeps the underlying file open; `use` closes it even when a parsing step throws.
    PDDocument.load(file).use { document ->
        val indexes = getIndexes(document)
        // The "+ 1" converts the number listed in the index into the stripper's page number
        // (presumably the PDF has one unnumbered leading page — confirm against the exam file).
        val answers = getAnswers(document, indexes.first { (name, _) -> name == AnswersPage }.number + 1)
        // NOTE(review): the two results below are computed but not consumed anywhere yet.
        val chapters = createChapters(indexes, answers)
        val grades = getGrades(document, indexes.first { (name, _) -> name == GradePage }.number + 1)
    }
}
/**
 * Reads the answer-key page and returns one [Answers] entry per answers line found on it.
 *
 * While the page is stripped, every string containing [ChapterLine] is recorded as a chapter
 * heading and every string containing [AnswersLine] is recorded as an answers line, each with
 * the (truncated) y coordinate of its first glyph. An answers line is then attributed to the
 * heading with the largest y that is still strictly smaller than the line's own y.
 *
 * @param document the loaded exam PDF.
 * @param page page number (as understood by [PDFTextStripper]) of the answer key.
 * @return one entry per answers line; each remaining character of the line (after removing
 *   [AnswersLine] and trimming) is converted with `char - '0'`.
 * @throws NoSuchElementException if an answers line has no chapter heading with a smaller y.
 */
fun getAnswers(document: PDDocument, page: Int): List<Answers> {
    val chapters = mutableListOf<Pair<String, Int>>() // heading text to y
    val answers = mutableListOf<Pair<String, Int>>() // answer characters to y
    val stripper = object : PDFTextStripper() {
        override fun writeString(text: String, textPositions: List<TextPosition>) {
            when {
                ChapterLine in text -> {
                    chapters += text to textPositions[0].y.toInt()
                }
                AnswersLine in text -> {
                    answers += text.replace(AnswersLine, "").trim() to textPositions[0].y.toInt()
                }
            }
        }
    }
    stripper.startPage = page
    stripper.endPage = page
    // Only the writeString callbacks matter here; the returned text is ignored.
    stripper.getText(document)

    // Sort once instead of once per answers line: the heading list does not change any more.
    val headingsByDescendingY = chapters.sortedByDescending { it.second }
    return answers.map { (digits, y) ->
        val heading = headingsByDescendingY.firstOrNull { (_, headingY) -> y - headingY > 0 }
            ?: throw NoSuchElementException(
                "No chapter heading found before the answers line at y=$y on page $page"
            )
        Answers(
            heading.first,
            // NOTE(review): assumes every remaining character is an ASCII digit — not validated.
            IntArray(digits.length) { digits[it] - '0' }
        )
    }
}
/**
 * Parses the table of contents on page 2 of [document].
 *
 * Only lines containing a dot leader (`....`) are considered. All dots in such a line are
 * turned into spaces, the first run of digits is taken as the page number, and whatever is
 * left (trimmed) becomes the entry name. Lines without any digits are dropped.
 *
 * @param document the loaded exam PDF.
 * @return the index entries in the order they appear on the page.
 */
fun getIndexes(document: PDDocument): List<Page> {
    val stripper = PDFTextStripper()
    stripper.startPage = 2
    stripper.endPage = 2
    val text = stripper.getText(document)

    // Compile the patterns once rather than once per line.
    val dotLeader = "[.]{2,}".toRegex()
    val digitRun = "\\d+".toRegex()

    // The stripper separates lines with the platform line separator by default, so splitting
    // on a literal "\r\n" only worked on Windows. lineSequence() accepts CRLF, LF and CR.
    return text.lineSequence()
        .filter { "...." in it }
        .map { it.replace(dotLeader, ".").replace('.', ' ').trim() }
        .mapNotNull { entry ->
            digitRun.find(entry)?.let { match ->
                Page(entry.removeRange(match.range).trim(), match.value.toInt())
            }
        }
        .toList()
}
/**
 * Combines index entries and answer keys into [Chapter]s.
 *
 * Each answers entry is matched to the index entry whose name equals the answers name with
 * every `" - "` replaced by a single space. The chapter's page range runs from that entry's
 * page number up to, but excluding, the page number of the next entry in page order.
 *
 * @param pages index entries, in any order.
 * @param answers answer keys, one per chapter.
 * @return one chapter per element of [answers], in the same order.
 * @throws IndexOutOfBoundsException if a chapter is missing from the index, or if it is the
 *   last index entry so there is no following entry to close its page range.
 */
fun createChapters(pages: List<Page>, answers: List<Answers>): List<Chapter> {
    val orderedPages = pages.sortedBy(Page::number)
    return answers.map { (name, chapterAnswers) ->
        val indexName = name.replace(" - ", " ")
        val pageIndex = orderedPages.indexOfFirst { it.name == indexName }
        // Fail with a readable message instead of an opaque out-of-bounds error from the list.
        if (pageIndex < 0) {
            throw IndexOutOfBoundsException("Chapter \"$indexName\" was not found in the index")
        }
        if (pageIndex == orderedPages.lastIndex) {
            throw IndexOutOfBoundsException(
                "Chapter \"$indexName\" is the last index entry; no following entry marks where it ends"
            )
        }
        Chapter(
            name,
            orderedPages[pageIndex].number until orderedPages[pageIndex + 1].number,
            chapterAnswers
        )
    }
}
/**
 * Reads the grade-estimation table on [page] and returns its rows as a list of [Grade]s.
 *
 * Every purely numeric string reported by the stripper is treated as one table row, except a
 * string whose value equals `page - 1`. The leading glyphs that sit within 10 units of each
 * other on the x axis form the row index; the remaining digits are split greedily into up to
 * three numbers, starting a new number whenever appending the next digit would push the
 * current one above 150. A number of 0 is stored as `null`.
 *
 * @param document the loaded exam PDF.
 * @param page page number (as understood by [PDFTextStripper]) of the grade table.
 * @return 47 grades, positioned by row index.
 * @throws IllegalArgumentException if any of the 47 rows was not found on the page.
 */
fun getGrades(document: PDDocument, page: Int): List<Grade> {
    val table = arrayOfNulls<Grade>(47)
    // Presumably the page number printed on the page itself — TODO confirm.
    val ignoredNumber = (page - 1).toLong()

    val stripper = object : PDFTextStripper() {
        override fun writeString(text: String, textPositions: List<TextPosition>) {
            val parsed = text.toLongOrNull()
            if (parsed == null || parsed == ignoredNumber) return

            // Collect the leading run of glyphs that are horizontally close to one another.
            var previousX = textPositions[0].x
            var rowKey: String = textPositions[0].unicode
            for (glyph in textPositions.subList(1, textPositions.size)) {
                if (Math.abs(glyph.x - previousX) < 10) {
                    rowKey += glyph.unicode
                    previousX = glyph.x
                } else {
                    break
                }
            }

            // Everything after the row key is a run of digits holding the concatenated scores.
            val scoreDigits = text.substring(rowKey.length)
            val scores = IntArray(3)
            var slot = 0
            var current = 0
            for (ch in scoreDigits) {
                val digit = ch - '0'
                val extended = current * 10 + digit
                if (extended > 150) {
                    // Appending this digit would exceed 150: close the number and start anew.
                    scores[slot++] = current
                    current = digit
                } else {
                    current = extended
                }
            }
            scores[slot] = current

            val (first, second, third) = scores.map { score -> score.takeIf { it != 0 } }
            table[rowKey.toInt()] = Grade(first, second, third)
        }
    }
    stripper.startPage = page
    stripper.endPage = page
    // Only the writeString callbacks matter here; the returned text is ignored.
    stripper.getText(document)

    return table.requireNoNulls().toList()
}
/**
 * One entry of the exam's index.
 *
 * @property name entry text with the page number removed.
 * @property number page number listed for the entry.
 */
data class Page(
    val name: String,
    val number: Int
)
/**
 * Answer key of a single chapter.
 *
 * `equals` and `hashCode` are overridden because the generated data-class versions would
 * compare the [answers] array by reference rather than by content.
 *
 * @property name chapter heading the answers belong to.
 * @property answers one value per question, in question order.
 */
data class Answers(val name: String, val answers: IntArray) {
    override fun equals(other: Any?): Boolean {
        if (other !is Answers) return false
        return name == other.name && answers.contentEquals(other.answers)
    }

    override fun hashCode(): Int {
        val nameHash = name.hashCode()
        return nameHash * 31 + answers.contentHashCode()
    }
}
/**
 * A chapter of the exam together with its answer key.
 *
 * `equals` and `hashCode` are overridden because the generated data-class versions would
 * compare the [answers] array by reference rather than by content.
 *
 * @property name chapter heading.
 * @property pages range of page numbers covered by the chapter.
 * @property answers one value per question, in question order.
 */
data class Chapter(val name: String, val pages: IntRange, val answers: IntArray) {
    override fun equals(other: Any?): Boolean {
        if (other !is Chapter) return false
        return name == other.name &&
            pages == other.pages &&
            answers.contentEquals(other.answers)
    }

    override fun hashCode(): Int {
        var result = name.hashCode()
        result = result * 31 + pages.hashCode()
        result = result * 31 + answers.contentHashCode()
        return result
    }
}
/**
 * One row of the grade-estimation table; a value is `null` when the row has no number for it.
 *
 * @property hebrew first number of the row (presumably the Hebrew-section estimate — confirm).
 * @property math second number of the row (presumably the math-section estimate — confirm).
 * @property english third number of the row (presumably the English-section estimate — confirm).
 */
data class Grade(
    val hebrew: Int?,
    val math: Int?,
    val english: Int?
)
/** Name of the index entry that [main] looks up to find the answer-key page (Hebrew, roughly "key of correct answers"). */
val AnswersPage: String = "מפתח תשובות נכונות"

/** Name of the index entry that [main] looks up to find the grade-estimation page (Hebrew, roughly "calculating the exam score estimate"). */
val GradePage: String = "חישוב אומדן ציוני הבחינה"

/** Marker that [getAnswers] uses to recognise a chapter heading (Hebrew for "chapter"). */
val ChapterLine: String = "פרק"

/** Marker that [getAnswers] uses to recognise, and strips from, an answers line (Hebrew, roughly "the correct one"). */
val AnswersLine: String = "הנכונה"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment