Skip to content

Instantly share code, notes, and snippets.

@marinsagovac
Created May 6, 2021 06:01
Show Gist options
  • Save marinsagovac/40dd3c46bdf19616b2b06363261bdfc8 to your computer and use it in GitHub Desktop.
Save marinsagovac/40dd3c46bdf19616b2b06363261bdfc8 to your computer and use it in GitHub Desktop.
PHP port of the JavaScript psl (Public Suffix List) library
<?php
// https://github.com/lupomontero/psl/blob/master/index.js
/**
 * Mutable holder for the parts of a parsed domain name.
 *
 * Every part starts out as null (and "listed" as false); callers fill in the
 * parts they know through the setters and read everything back at once with
 * getResult().
 */
class Domain
{
    /** @var string|null The name exactly as it was handed to setInput(). */
    protected ?string $input = null;

    /** @var string|null Value stored by setTld(). */
    protected $tld = null;

    /** @var string|null Value stored by setSld(). */
    protected $sld = null;

    /** @var string|null Value stored by setDomain(). */
    protected $domain = null;

    /** @var string|null Value stored by setSubdomain(). */
    protected $subdomain = null;

    /** @var bool|null Flag stored by setListed(); false until set. */
    protected ?bool $listed = false;

    /** Stores the original input string. */
    public function setInput(string $input)
    {
        $this->input = $input;
    }

    /** Stores the "tld" part. */
    public function setTld(string $tld)
    {
        $this->tld = $tld;
    }

    /** Stores the "sld" part. */
    public function setSld(string $sld)
    {
        $this->sld = $sld;
    }

    /** Stores the "domain" part. */
    public function setDomain(string $domain)
    {
        $this->domain = $domain;
    }

    /** Stores the "subdomain" part. */
    public function setSubdomain(string $subdomain)
    {
        $this->subdomain = $subdomain;
    }

    /** Stores the "listed" flag. */
    public function setListed(bool $listed)
    {
        $this->listed = $listed;
    }

    /**
     * Returns a snapshot of all parts.
     *
     * @return array{input: ?string, tld: ?string, sld: ?string, domain: ?string, subdomain: ?string, listed: ?bool}
     */
    public function getResult() : array
    {
        return [
            'input' => $this->input,
            'tld' => $this->tld,
            'sld' => $this->sld,
            'domain' => $this->domain,
            'subdomain' => $this->subdomain,
            'listed' => $this->listed,
        ];
    }

    /**
     * Returns the value stored by setSld(), or null when it was never set.
     *
     * Reads the property directly; calling the getter from inside itself
     * would recurse without end.
     */
    public function getSld() : ?string
    {
        return $this->sld;
    }

    /**
     * Returns the value stored by setTld(), or null when it was never set.
     *
     * Reads the property directly; calling the getter from inside itself
     * would recurse without end.
     */
    public function getTld() : ?string
    {
        return $this->tld;
    }
}
/**
 * Public-suffix based domain name parser.
 *
 * Splits a host name into its public suffix ("tld"), the label registered
 * directly under it ("sld"), the registrable domain ("sld.tld") and the
 * remaining subdomain, using a JSON list of public suffix rules.
 *
 * Rules are plain suffixes ("co.uk"), wildcard rules ("*.ck") or exception
 * rules ("!www.ck"). Names containing non-ASCII characters are converted with
 * idn_to_ascii() (intl extension); pure-ASCII names never touch intl.
 */
class psl
{
    /** Location of the JSON rule list, resolved against the working directory. */
    private const RULES_FILE = './data/rules.json';

    /** Human-readable messages for the codes returned by validate(). */
    private const ERROR_MESSAGES = [
        'DOMAIN_TOO_SHORT' => 'Domain name too short',
        'DOMAIN_TOO_LONG' => 'Domain name too long. It should be no more than 255 chars.',
        'LABEL_STARTS_WITH_DASH' => 'Domain name label can not start with a dash.',
        'LABEL_ENDS_WITH_DASH' => 'Domain name label can not end with a dash.',
        'LABEL_TOO_LONG' => 'Domain name label should be at most 63 chars long.',
        'LABEL_TOO_SHORT' => 'Domain name label should be at least 1 character long.',
        'LABEL_INVALID_CHARS' => 'Domain name label can only contain alphanumeric characters or dashes.',
    ];

    /** @var array<int|string, array<string, mixed>>|null Rule table, loaded on first lookup. */
    private ?array $rules = null;

    /** @var array Result of parsing the constructor argument. */
    private array $parsed = [];

    /**
     * Parses $input immediately and keeps the result for getParsed().
     *
     * @throws \RuntimeException When the rule list is needed but cannot be loaded.
     */
    public function __construct(string $input)
    {
        $this->parsed = $this->parse($input);
    }

    /**
     * Returns the result of parsing the name passed to the constructor.
     *
     * @return array Same shape as the return value of parse().
     */
    public function getParsed() : array
    {
        return $this->parsed;
    }

    /**
     * Parses a host name.
     *
     * The name is lower-cased and a single trailing dot (FQDN form) is
     * dropped before validation.
     *
     * @return array On invalid input: ['input' => ..., 'error' => ['message' => ..., 'code' => ...]].
     *               Otherwise: ['input', 'tld', 'sld', 'domain', 'subdomain', 'listed'],
     *               where parts that could not be determined are null.
     *
     * @throws \RuntimeException When the rule list cannot be loaded.
     */
    public function parse(string $input)
    {
        $domain = self::removeTrailingDot(strtolower($input));

        $error = $this->validate($domain);
        if ($error !== null) {
            return [
                'input' => $input,
                'error' => [
                    'message' => $this->errorCodes($error),
                    'code' => $error,
                ],
            ];
        }

        $d = new Domain();
        $d->setInput($input);

        $labels = explode('.', $domain);

        // Non-Internet TLD: nothing to look up.
        if (end($labels) === 'local') {
            return $d->getResult();
        }

        $rule = $this->findRule($domain);

        // Unlisted suffix: treat the last label as the TLD.
        if ($rule === null) {
            if (count($labels) < 2) {
                return $d->getResult();
            }

            $tld = array_pop($labels);
            $sld = array_pop($labels);
            $d->setTld($tld);
            $d->setSld($sld);
            $d->setDomain($sld . '.' . $tld);

            if ($labels !== []) {
                // Only the label closest to the domain is kept here.
                $d->setSubdomain(end($labels));
            }

            return $this->finalize($d, $domain);
        }

        // From here on the public suffix is known to be listed.
        $d->setListed(true);

        $tldParts = explode('.', $rule['suffix']);
        // Everything left of the suffix; max() guards against a negative
        // length, which array_slice() would count from the end.
        $privateParts = array_slice($labels, 0, max(0, count($labels) - count($tldParts)));

        if ($rule['exception']) {
            // The left-most label of an exception rule is not part of the suffix.
            $privateParts[] = array_shift($tldParts);
        }

        $d->setTld(implode('.', $tldParts));

        if ($privateParts === []) {
            return $this->finalize($d, $domain);
        }

        if ($rule['wildcard']) {
            // A wildcard rule absorbs one more label into the suffix.
            // array_unshift() works in place; its return value is only a count.
            array_unshift($tldParts, array_pop($privateParts));
            $d->setTld(implode('.', $tldParts));
        }

        if ($privateParts === []) {
            return $this->finalize($d, $domain);
        }

        $sld = array_pop($privateParts);
        $d->setSld($sld);
        $d->setDomain($sld . '.' . implode('.', $tldParts));

        if ($privateParts !== []) {
            $d->setSubdomain(implode('.', $privateParts));
        }

        return $this->finalize($d, $domain);
    }

    /**
     * Converts a domain string to its ASCII (punycode) form.
     *
     * @param mixed $domain Domain name; non-strings and '' are returned untouched.
     *
     * @return mixed The ASCII form, or the argument itself when it is not a
     *               non-empty string or cannot be converted.
     */
    public function handlePunyCode($domain)
    {
        if (!is_string($domain) || $domain === '') {
            return $domain;
        }

        return self::toAscii($domain);
    }

    /**
     * Tells whether $domain parses to a registrable domain under a listed suffix.
     *
     * @param string $domain Host name to check.
     *
     * @throws \RuntimeException When the rule list cannot be loaded.
     */
    public function isValid($domain) : bool
    {
        $parsed = $this->parse($domain);

        // Error results carry neither key, hence empty() instead of direct access.
        return !empty($parsed['domain']) && !empty($parsed['listed']);
    }

    /**
     * Alias of parse() with a declared array return type.
     *
     * @throws \RuntimeException When the rule list cannot be loaded.
     */
    public function getDomain(string $domain) : array
    {
        return $this->parse($domain);
    }

    /**
     * Finds the rule that governs $domain.
     *
     * A rule matches when the ASCII form of the domain equals the ASCII form
     * of the rule's suffix or ends with "." followed by it. Among several
     * matches an exception rule wins; otherwise the rule with the most labels
     * wins (a wildcard counts as one label). On a tie the later rule wins.
     *
     * @return array|null The winning rule with keys 'rule', 'suffix',
     *                    'punySuffix', 'wildcard', 'exception' and 'length'
     *                    (byte length of 'punySuffix'), or null when no rule matches.
     *
     * @throws \RuntimeException When the rule list cannot be loaded.
     */
    public function findRule(string $domain)
    {
        $punyDomain = self::toAscii($domain);
        $best = null;
        $bestRank = -1;

        foreach ($this->getRules() as $rule) {
            $punySuffix = $rule['punySuffix'];

            // The "." prefix enforces a label boundary, so "notcom" does not match "com".
            if ($punyDomain !== $punySuffix && !self::endsWith($punyDomain, '.' . $punySuffix)) {
                continue;
            }

            $rank = $rule['exception']
                ? PHP_INT_MAX
                : substr_count($punySuffix, '.') + 1 + ($rule['wildcard'] ? 1 : 0);

            if ($rank >= $bestRank) {
                $best = $rule;
                $best['length'] = strlen($punySuffix);
                $bestRank = $rank;
            }
        }

        return $best;
    }

    /**
     * Translates an error code returned by validate() into its message.
     *
     * @throws \InvalidArgumentException When $error is not a known code.
     */
    public function errorCodes(string $error) : string
    {
        if (!isset(self::ERROR_MESSAGES[$error])) {
            throw new \InvalidArgumentException('Unknown error code: ' . $error);
        }

        return self::ERROR_MESSAGES[$error];
    }

    /**
     * Drops a single trailing dot (FQDN notation) from $domain, if present.
     */
    public static function removeTrailingDot(string $domain) : string
    {
        if (substr($domain, -1) === '.') {
            return (string) substr($domain, 0, -1);
        }

        return $domain;
    }

    /**
     * Checks a (lower-cased) domain name for structural problems.
     *
     * Input that is not valid UTF-8 is interpreted as ISO-8859-1 first. The
     * checks run on the ASCII form of the name; when that conversion fails
     * they run on the name as given, so the specific problem is still reported.
     *
     * @return string|null One of the codes understood by errorCodes(), or
     *                     null when the name passes every check.
     */
    public function validate(string $input)
    {
        // Checked up front: idn_to_ascii() refuses an empty name.
        if ($input === '') {
            return 'DOMAIN_TOO_SHORT';
        }

        if (!self::isValidUtf8($input)) {
            $input = self::latin1ToUtf8($input);
        }

        $ascii = self::toAscii($input);

        if (strlen($ascii) < 1) {
            return 'DOMAIN_TOO_SHORT';
        }
        if (strlen($ascii) > 255) {
            return 'DOMAIN_TOO_LONG';
        }

        foreach (explode('.', $ascii) as $label) {
            $length = strlen($label);

            if ($length === 0) {
                return 'LABEL_TOO_SHORT';
            }
            if ($length > 63) {
                return 'LABEL_TOO_LONG';
            }
            if ($label[0] === '-') {
                return 'LABEL_STARTS_WITH_DASH';
            }
            if ($label[$length - 1] === '-') {
                return 'LABEL_ENDS_WITH_DASH';
            }
            // D modifier: "$" must not match before a trailing newline.
            if (preg_match('/^[a-z0-9\-]+$/D', $label) !== 1) {
                return 'LABEL_INVALID_CHARS';
            }
        }

        return null;
    }

    /**
     * Tells whether $input consists solely of well-formed UTF-8 sequences.
     *
     * Of the ASCII range only tab, LF, CR and the printable characters
     * (0x20-0x7E) are accepted.
     */
    public static function isValidUtf8(string $input) : bool
    {
        return preg_match('%^(?:
            [\x09\x0A\x0D\x20-\x7E] # ASCII
            | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
            | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
            | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
            | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
            | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
            )*$%xs', $input) === 1;
    }

    /**
     * Loads the rule list and expands every entry into a lookup record.
     *
     * The list is read once per instance. The file is looked up relative to
     * the working directory first and next to this source file second.
     *
     * @return array<int|string, array{rule: string, suffix: string, punySuffix: string, wildcard: bool, exception: bool}>
     *
     * @throws \RuntimeException When the file cannot be read or does not hold a JSON list.
     */
    private function getRules() : array
    {
        if ($this->rules !== null) {
            return $this->rules;
        }

        $path = is_readable(self::RULES_FILE) ? self::RULES_FILE : __DIR__ . '/data/rules.json';
        $json = is_readable($path) ? file_get_contents($path) : false;
        if ($json === false) {
            throw new \RuntimeException('Unable to read public suffix rules from ' . $path);
        }

        $list = json_decode($json, true);
        if (!is_array($list)) {
            throw new \RuntimeException('Public suffix rules in ' . $path . ' are not a JSON list');
        }

        $rules = [];
        foreach ($list as $key => $rule) {
            if (!is_string($rule) || $rule === '') {
                continue;
            }

            // Strip the wildcard ("*.") or exception ("!") marker.
            $suffix = (string) preg_replace('/^(\*\.|\!)/', '', $rule);
            if ($suffix === '') {
                continue;
            }

            $rules[$key] = [
                'rule' => $rule,
                'suffix' => $suffix,
                'punySuffix' => self::toAscii($suffix),
                'wildcard' => $rule[0] === '*',
                'exception' => $rule[0] === '!',
            ];
        }

        return $this->rules = $rules;
    }

    /**
     * Byte-wise suffix test.
     *
     * @return int|null Byte length of $needle when $haystack ends with it,
     *                  0 otherwise (including for an empty $needle).
     */
    public static function endsWith(string $haystack, string $needle) : ?int
    {
        $length = strlen($needle);
        if ($length === 0 || $length > strlen($haystack)) {
            return 0;
        }

        return substr_compare($haystack, $needle, -$length) === 0 ? $length : 0;
    }

    /**
     * Returns the parsed parts, converting "domain" and "subdomain" to ASCII
     * when the (lower-cased) input contained punycode labels.
     */
    private function finalize(Domain $d, string $domain) : array
    {
        $parsed = $d->getResult();

        if (strpos($domain, 'xn--') === false) {
            return $parsed;
        }

        foreach (['domain', 'subdomain'] as $field) {
            if ($parsed[$field] !== null && $parsed[$field] !== '') {
                $parsed[$field] = $this->handlePunyCode($parsed[$field]);
            }
        }

        return $parsed;
    }

    /**
     * Returns the ASCII form of a domain name.
     *
     * Pure-ASCII names are only lower-cased; idn_to_ascii() is consulted for
     * names containing non-ASCII bytes. When the conversion fails the name is
     * returned unchanged so callers can still inspect it.
     */
    private static function toAscii(string $name) : string
    {
        if (preg_match('/[^\x00-\x7F]/', $name) !== 1) {
            return strtolower($name);
        }

        $ascii = idn_to_ascii($name, IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46);

        return $ascii === false ? $name : $ascii;
    }

    /**
     * Re-encodes ISO-8859-1 bytes as UTF-8.
     *
     * Prefers mbstring; utf8_encode() is only the fallback because it is
     * deprecated as of PHP 8.2.
     */
    private static function latin1ToUtf8(string $bytes) : string
    {
        if (function_exists('mb_convert_encoding')) {
            return (string) mb_convert_encoding($bytes, 'UTF-8', 'ISO-8859-1');
        }

        return utf8_encode($bytes);
    }
}
// Manual smoke test: constructing a psl object calls parse() on the given name.
// Further sample inputs are kept below, commented out; enable one at a time.
// new psl("www.google.co.uk");
// new psl("täst.de.");
// new psl("local");
new psl("食狮.com.cn");
// new psl("a.b.c.d.foo.com");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment