Last active
April 6, 2023 08:33
-
-
Save mrpapercut/6c11e215b4b5b528ac0ff678dae1e297 to your computer and use it in GitHub Desktop.
T5K primelist parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Downloads the t5k.org "all primes" text list and parses it into two public
 * arrays: $primes (one record per listed prime) and $proofcodes (one record
 * per entry of the "KEY TO PROOF-CODES" section).
 */
class T5KParser {
    /** Number of leading lines skipped as the intro header of the list. */
    private const HEADER_LINE_COUNT = 31;

    private $primelistURL = 'https://t5k.org/primes/lists/all.txt';
    private $primelistRaw;

    /** @var array[] Records with keys rank, ranknote, sum, digitlength, proofcode, year, comment, checksum. */
    public $primes = [];

    /** @var array[] Records with keys proofcode, provers. */
    public $proofcodes = [];

    /**
     * Downloads the list and fills $primes and $proofcodes.
     *
     * @throws Exception When the download fails or yields no content.
     */
    public function parse() {
        $this->getList();
        $this->parseList();
    }

    /**
     * Fetches the raw list over HTTP into $primelistRaw.
     *
     * @throws Exception When cURL reports a failure.
     */
    private function getList() {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $this->primelistURL,
            CURLOPT_RETURNTRANSFER => true,
        ]);

        $rawlist = curl_exec($ch);
        // Read the error before closing the handle; it is gone afterwards.
        $error = curl_error($ch);
        curl_close($ch);

        // curl_exec() returns false (not null) on failure.
        if ($rawlist === false) {
            throw new Exception('Could not download prime list: ' . $error);
        }

        $this->primelistRaw = $rawlist;
    }

    /**
     * Parses the previously downloaded text.
     *
     * @throws Exception When there is no text to parse.
     */
    private function parseList() {
        if (!is_string($this->primelistRaw) || $this->primelistRaw === '') {
            throw new Exception('Raw file is empty');
        }

        // Start from a clean slate so a second parse() does not duplicate entries.
        $this->primes = [];
        $this->proofcodes = [];

        $this->parsePrimes();
        $this->parseProofcodes();
    }

    /**
     * Splits the raw text into lines on any newline convention, so the result
     * does not depend on the platform's PHP_EOL.
     *
     * @return string[]
     */
    private function splitRawLines() {
        $lines = preg_split('/\R/', $this->primelistRaw);

        return $lines === false ? [] : $lines;
    }

    /**
     * Collects one logical line per prime, joining wrapped continuation lines.
     *
     * A line belongs to prime N when it starts with N (primes are numbered
     * consecutively from 1); any other line continues the previous prime.
     *
     * @return string[] Logical lines keyed by prime index (starting at 1).
     */
    private function getPrimesFromRawList() {
        $lines = $this->splitRawLines();
        $lineCount = count($lines);
        $foundlines = [];
        $prime_idx = 1;

        for ($line_idx = self::HEADER_LINE_COUNT; $line_idx < $lineCount; $line_idx++) {
            $line = trim($lines[$line_idx]);

            // A dashed separator ends the prime listing (the JavaScript port of
            // this parser stops here too); without it the trailing sections
            // would be appended to the last prime.
            if (str_starts_with($line, '-----')) {
                break;
            }

            if (str_starts_with($line, (string) $prime_idx)) {
                $foundlines[$prime_idx] = $line;
                $prime_idx++;
                continue;
            }

            // Nothing to continue yet: ignore lines before the first prime.
            if (!isset($foundlines[$prime_idx - 1])) {
                continue;
            }

            $previous = $foundlines[$prime_idx - 1];
            if (str_ends_with($previous, '\\')) {
                // Drop only the trailing continuation marker, not every backslash.
                $foundlines[$prime_idx - 1] = substr($previous, 0, -1) . $line;
            } else {
                $foundlines[$prime_idx - 1] = $previous . ' ' . $line;
            }
        }

        return $foundlines;
    }

    /**
     * Turns every logical prime line into a record appended to $primes.
     * Lines that match none of the patterns are reported and skipped.
     */
    private function parsePrimes() {
        $rawlines = $this->getPrimesFromRawList();

        $patterns = [
            // 99% of cases
            '/^(\d{1,4})([a-z])?\s+(\S*)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)/',
            // Cases with sum like "10000000000000000000...(34053 other digits)...00000000000000532669"
            '/^(\d{1,4})([a-z])?\s+(\d+\.\.\.[\(\)\w\s]+\.\.\.\d+)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)?/',
            // Case "5163e Ramanujan tau function at 199^4518 ECPP 57125 E3 2022 ECPP"
            '/^(\d{1,4})([a-z])?\s+([\w\s]+[\d\^]+[\sA-Za-z]+)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)?/',
        ];

        foreach ($rawlines as $line) {
            $matches = [];
            foreach ($patterns as $pattern) {
                if (preg_match($pattern, $line, $matches) === 1) {
                    break;
                }
            }

            if (count($matches) <= 1) {
                var_dump("Could not parse line '" . $line . "'");
                continue;
            }

            $sum = trim($matches[3]);

            $this->primes[] = [
                'rank' => intval($matches[1]),
                'ranknote' => $matches[2],
                'sum' => $sum,
                'digitlength' => intval($matches[4]),
                'proofcode' => $matches[5],
                'year' => intval($matches[6]),
                // preg_match() drops trailing groups that did not participate.
                'comment' => trim($matches[7] ?? ''),
                'checksum' => dechex(crc32($sum)),
            ];
        }
    }

    /**
     * Reads the entries that follow the "KEY TO PROOF-CODES" heading and
     * appends them to $proofcodes.
     */
    private function parseProofcodes() {
        $lines = $this->splitRawLines();
        $lineCount = count($lines);
        $sectionStarted = false;
        $proofcodeRegex = '/^([A-Za-z0-9]+)\s+(.*)$/';

        for ($line_idx = self::HEADER_LINE_COUNT; $line_idx < $lineCount; $line_idx++) {
            $line = rtrim($lines[$line_idx]);

            if ($sectionStarted) {
                if (preg_match($proofcodeRegex, $line, $matches) === 1 && $matches[1] !== 'KEY') {
                    $this->proofcodes[] = [
                        'proofcode' => $matches[1],
                        'provers' => $matches[2],
                    ];
                }
                continue;
            }

            // str_starts_with() takes the haystack first. With the arguments
            // swapped, any blank line counted as the start of the section.
            if (str_starts_with($line, 'KEY TO PROOF-CODES')) {
                $sectionStarted = true;
            }
        }
    }
}
// Run the parser as soon as the script is loaded. Afterwards the results are
// available as public properties:
//   Primes:     $t5kparser->primes
//   Proofcodes: $t5kparser->proofcodes
$t5kparser = new T5KParser;
$t5kparser->parse();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// npm install --save axios crc-32
const axios = require('axios'); | |
const crc32 = require('crc-32'); | |
/**
 * Downloads the t5k.org "all primes" text list and parses it into
 * `primes` (one record per listed prime) and `proofcodes` (one record per
 * entry of the "KEY TO PROOF-CODES" section).
 */
class T5KParser {
  /** Number of leading lines skipped as the intro header of the list. */
  static get HEADER_LINE_COUNT() {
    return 31;
  }

  constructor() {
    this.primelistURL = 'https://t5k.org/primes/lists/all.txt';
    this.primes = [];
    this.proofcodes = [];
  }

  /**
   * Computes the checksum of a prime description.
   *
   * @param {string} inputStr - Text to checksum.
   * @returns {string} Eight lowercase hex digits (big-endian, zero padded).
   */
  getChecksum(inputStr) {
    // NOTE(review): this value is passed as the second argument of
    // crc32.buf(); it looks like the CRC-32 polynomial rather than a running
    // CRC, so results presumably differ from an unseeded CRC-32 — confirm
    // that this is intended before relying on cross-tool comparisons.
    const seed = 0x04C11DB7;
    const checksum = crc32.buf(Buffer.from(inputStr, 'binary'), seed);

    // `>>> 0` reinterprets the signed 32-bit result as unsigned.
    return (checksum >>> 0).toString(16).padStart(8, '0');
  }

  /**
   * Extracts the entries that follow the "KEY TO PROOF-CODES" heading.
   *
   * @param {string} filecontents - Full text of the list.
   * @returns {{proofcode: string, provers: string}[]} Entries in file order.
   */
  getProofcodesFromList(filecontents) {
    const lines = filecontents.split('\n').map((l) => l.trim());
    const proofcodeLineRegex = /^([A-Za-z0-9]+)\s+(.*)$/;
    const proofcodes = [];
    let sectionStarted = false;

    for (const line of lines.slice(T5KParser.HEADER_LINE_COUNT)) {
      if (sectionStarted) {
        const match = proofcodeLineRegex.exec(line);
        if (match) {
          const [, proofcode, provers] = match;
          proofcodes.push({ proofcode, provers });
        }
      }

      // Checked after the entry test so the heading itself is never an entry.
      if (line.startsWith('KEY TO PROOF-CODES')) {
        sectionStarted = true;
      }
    }

    return proofcodes;
  }

  /**
   * Collects one logical line per prime, joining wrapped continuation lines.
   *
   * A line belongs to prime N when it starts with N (primes are numbered
   * consecutively from 1); any other line continues the previous prime.
   * Reading stops at the first line starting with "-----".
   *
   * @param {string} filecontents - Full text of the list.
   * @returns {string[]} Logical prime lines in file order.
   */
  getPrimesFromList(filecontents) {
    const lines = filecontents.split('\n').map((l) => l.trim());
    const entries = [];

    for (const line of lines.slice(T5KParser.HEADER_LINE_COUNT)) {
      if (line.startsWith('-----')) {
        break;
      }

      const expectedRank = String(entries.length + 1);
      if (line.startsWith(expectedRank)) {
        entries.push(line);
        continue;
      }

      // Nothing to continue yet: ignore lines before the first prime
      // instead of dereferencing a missing previous entry.
      if (entries.length === 0) {
        continue;
      }

      const lastIdx = entries.length - 1;
      const previous = entries[lastIdx];
      entries[lastIdx] = previous.endsWith('\\')
        ? previous.slice(0, -1) + line
        : `${previous} ${line}`;
    }

    return entries.filter((l) => l);
  }

  /**
   * Downloads the list and fills `this.proofcodes` and `this.primes`.
   *
   * @returns {Promise<void>}
   */
  async parse() {
    const primelist = await axios.get(this.primelistURL);
    this.proofcodes = this.parseProofcodes(primelist.data);
    this.primes = this.parsePrimes(primelist.data);
  }

  /**
   * @param {string} filecontents - Full text of the list.
   * @returns {{proofcode: string, provers: string}[]} Parsed proof codes.
   */
  parseProofcodes(filecontents) {
    return this.getProofcodesFromList(filecontents);
  }

  /**
   * Parses every prime line into a record. Lines that match none of the
   * known layouts are logged and skipped.
   *
   * @param {string} filecontents - Full text of the list.
   * @returns {object[]} Records with rank, rankNote, sum, digitlength,
   *   proofcode, year, comment, provers (null when the proof code is not
   *   listed) and checksum.
   */
  parsePrimes(filecontents) {
    const lines = this.getPrimesFromList(filecontents);

    // Use the proof codes stored by parse(); when this method is called on
    // its own, derive them from the same text instead of failing.
    const knownCodes = this.proofcodes?.length
      ? this.proofcodes
      : this.parseProofcodes(filecontents);
    const proversByCode = new Map();
    for (const { proofcode, provers } of knownCodes) {
      // Keep the first occurrence of a code.
      if (!proversByCode.has(proofcode)) {
        proversByCode.set(proofcode, provers);
      }
    }

    const primeRegexes = [
      // 99% of cases
      /^(\d{1,4})([a-z])?\s+(\S*)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)/,
      // Cases with sum like "10000000000000000000...(34053 other digits)...00000000000000532669"
      /^(\d{1,4})([a-z])?\s+(\d+\.\.\.[\(\)\w\s]+\.\.\.\d+)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)?/,
      // Case "5163e Ramanujan tau function at 199^4518 ECPP 57125 E3 2022 ECPP"
      /^(\d{1,4})([a-z])?\s+([\w\s]+[\d\^]+[\sA-Za-z]+)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)?/,
    ];

    const parsedList = [];
    for (const l of lines) {
      let match = null;
      for (const regex of primeRegexes) {
        match = regex.exec(l);
        if (match) {
          break;
        }
      }

      if (!match) {
        console.log(l);
        continue;
      }

      const [, rank, rankNote, rawSum, digitlength, proofcode, year, comment] = match;
      const sum = rawSum.trim();

      parsedList.push({
        rank: parseInt(rank, 10),
        rankNote,
        sum,
        digitlength: parseInt(digitlength, 10),
        proofcode,
        year: parseInt(year, 10),
        comment: comment ? comment.trim() : '',
        provers: proversByCode.get(proofcode) ?? null,
        checksum: this.getChecksum(sum),
      });
    }

    return parsedList;
  }
}
const t5kparser = new T5KParser();

// Entry point: download and parse the list, reporting any failure instead of
// leaving an unhandled promise rejection.
(async () => {
  try {
    // The instance is named `t5kparser`; the previous `parser` was undefined.
    await t5kparser.parse();
    /*
    Primes: t5kparser.primes
    Proofcodes: t5kparser.proofcodes
    */
  } catch (err) {
    console.error('Failed to parse the T5K prime list:', err);
    process.exitCode = 1;
  }
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment