Created
May 6, 2021 06:01
-
-
Save marinsagovac/40dd3c46bdf19616b2b06363261bdfc8 to your computer and use it in GitHub Desktop.
psl fork in PHP from JS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// PHP port of the JavaScript "psl" (Public Suffix List) library: https://github.com/lupomontero/psl/blob/master/index.js
/**
 * Mutable holder for the pieces of a parsed domain name.
 *
 * Every component starts as null (and `listed` as false); the parser fills in
 * whatever it can determine and then exports the state with getResult().
 */
class Domain
{
    /** The string handed to the parser. */
    protected ?string $input = null;

    /** Top-level domain / public suffix. */
    protected ?string $tld = null;

    /** Second-level domain: the label directly left of the TLD. */
    protected ?string $sld = null;

    /** Registrable domain, i.e. "<sld>.<tld>". */
    protected ?string $domain = null;

    /** Everything left of the registrable domain. */
    protected ?string $subdomain = null;

    /** Flag set by the parser; false until setListed() is called. */
    protected ?bool $listed = false;

    public function setInput(string $input): void
    {
        $this->input = $input;
    }

    public function setTld(string $tld): void
    {
        $this->tld = $tld;
    }

    public function setSld(string $sld): void
    {
        $this->sld = $sld;
    }

    public function setDomain(string $domain): void
    {
        $this->domain = $domain;
    }

    public function setSubdomain(string $subdomain): void
    {
        $this->subdomain = $subdomain;
    }

    public function setListed(bool $listed): void
    {
        $this->listed = $listed;
    }

    /**
     * Export the current state as an associative array.
     *
     * @return array{input: ?string, tld: ?string, sld: ?string, domain: ?string, subdomain: ?string, listed: ?bool}
     */
    public function getResult(): array
    {
        return [
            'input' => $this->input,
            'tld' => $this->tld,
            'sld' => $this->sld,
            'domain' => $this->domain,
            'subdomain' => $this->subdomain,
            'listed' => $this->listed,
        ];
    }

    /**
     * @return string|null The second-level domain, or null when it has not been set.
     */
    public function getSld(): ?string
    {
        // Read the property; the previous implementation called itself and
        // recursed until the stack overflowed.
        return $this->sld;
    }

    /**
     * @return string|null The top-level domain, or null when it has not been set.
     */
    public function getTld(): ?string
    {
        // Same fix as getSld(): return the stored value instead of recursing.
        return $this->tld;
    }
}
/**
 * Public Suffix List based domain parser.
 *
 * Splits a host name into public suffix ("tld"), second-level domain ("sld"),
 * registrable domain and subdomain, using the rules read from
 * ./data/rules.json (path resolved against the current working directory).
 */
class psl
{
    /** Human-readable messages keyed by validation error code. */
    private const ERROR_MESSAGES = [
        'DOMAIN_TOO_SHORT' => 'Domain name too short',
        'DOMAIN_TOO_LONG' => 'Domain name too long. It should be no more than 255 chars.',
        'LABEL_STARTS_WITH_DASH' => 'Domain name label can not start with a dash.',
        'LABEL_ENDS_WITH_DASH' => 'Domain name label can not end with a dash.',
        'LABEL_TOO_LONG' => 'Domain name label should be at most 63 chars long.',
        'LABEL_TOO_SHORT' => 'Domain name label should be at least 1 character long.',
        'LABEL_INVALID_CHARS' => 'Domain name label can only contain alphanumeric characters or dashes.',
    ];

    /** @var array<int|string, array<string, mixed>>|null Rules, loaded on first use and cached per instance. */
    private ?array $rules = null;

    /** @var array<string, mixed> Result of parsing the constructor argument. */
    private array $parsed = [];

    /**
     * Parse $input immediately; the result is available through getParsed().
     */
    public function __construct(string $input)
    {
        // The parse result used to be thrown away, which made the constructor pointless.
        $this->parsed = $this->parse($input);
    }

    /**
     * @return array<string, mixed> The parse() result for the constructor argument.
     */
    public function getParsed(): array
    {
        return $this->parsed;
    }

    /**
     * Parse a host name.
     *
     * @return array<string, mixed> On success: keys input, tld, sld, domain, subdomain, listed.
     *                              On validation failure: keys input and error (message, code).
     */
    public function parse(string $input): array
    {
        // Normalise: lowercase, then drop the trailing dot of a fully-qualified name.
        $domain = self::removeTrailingDot($this->lowercase($input));

        $error = $this->validate($domain);
        if ($error !== null) {
            return [
                'input' => $input,
                'error' => [
                    'message' => $this->errorCodes($error),
                    'code' => $error,
                ],
            ];
        }

        $d = new Domain();
        $d->setInput($input);

        $domainParts = explode('.', $domain);

        // Non-Internet TLD: nothing to split.
        if (end($domainParts) === 'local') {
            return $d->getResult();
        }

        $rule = $this->findRule($domain);

        // Unlisted TLD: best-effort split on the last two labels.
        if ($rule === null) {
            if (count($domainParts) < 2) {
                return $d->getResult();
            }
            $tld = array_pop($domainParts);
            $sld = array_pop($domainParts);
            $d->setTld($tld);
            $d->setSld($sld);
            $d->setDomain($sld . '.' . $tld);
            if ($domainParts !== []) {
                // Only the nearest label is reported here, as the original port did.
                $d->setSubdomain(array_pop($domainParts));
            }

            // Return here; falling through would dereference the missing rule.
            return $this->withPunycode($d, $domain);
        }

        // From here on the public suffix is known to be listed.
        $d->setListed(true);

        $tldParts = explode('.', $rule['suffix']);
        $privateParts = array_slice($domainParts, 0, max(0, count($domainParts) - count($tldParts)));

        // Exception rule ("!foo.bar"): the leftmost suffix label is registrable after all.
        if ($rule['exception']) {
            $privateParts[] = array_shift($tldParts);
        }
        $d->setTld(implode('.', $tldParts));

        if ($privateParts === []) {
            return $this->withPunycode($d, $domain);
        }

        // Wildcard rule ("*.foo"): one more label belongs to the public suffix.
        if ($rule['wildcard']) {
            // array_unshift() returns the new element count, so its result must
            // not be assigned back to $tldParts.
            array_unshift($tldParts, array_pop($privateParts));
            $d->setTld(implode('.', $tldParts));
        }

        if ($privateParts === []) {
            return $this->withPunycode($d, $domain);
        }

        $tld = implode('.', $tldParts);
        $sld = array_pop($privateParts);
        $d->setSld($sld);
        // The registrable domain is exactly one label plus the public suffix.
        $d->setDomain($sld . '.' . $tld);

        if ($privateParts !== []) {
            $d->setSubdomain(implode('.', $privateParts));
        }

        return $this->withPunycode($d, $domain);
    }

    /**
     * Convert a domain name to its ASCII (punycode) form.
     *
     * @param mixed $domain Domain name; non-string or empty values are returned untouched.
     * @return mixed The ASCII form, or $domain unchanged when it cannot be converted.
     */
    public function handlePunyCode($domain)
    {
        if (!is_string($domain) || $domain === '') {
            return $domain;
        }

        return $this->toAscii($domain) ?? $domain;
    }

    /**
     * Whether $domain parses to a registrable domain under a listed public suffix.
     *
     * @param mixed $domain
     */
    public function isValid($domain): bool
    {
        if (!is_string($domain) || $domain === '') {
            return false;
        }

        $parsed = $this->parse($domain);

        // Error results carry neither key, so read both defensively.
        return ($parsed['domain'] ?? null) !== null && ($parsed['listed'] ?? false) === true;
    }

    /**
     * Alias of parse().
     *
     * @return array<string, mixed>
     */
    public function getDomain(string $domain): array
    {
        return $this->parse($domain);
    }

    /**
     * Find the rule whose suffix matches the end of $domain.
     *
     * When several rules match, the one with the longest ASCII suffix wins and,
     * on equal length, the one that appears later in the rules file.
     *
     * @return array<string, mixed>|null The rule (plus a 'length' key) or null when none matches.
     */
    public function findRule(string $domain): ?array
    {
        // Fall back to the raw name when IDNA conversion rejects it, so plain
        // ASCII names can still be matched.
        $punyDomain = $this->toAscii($domain) ?? $domain;

        $best = null;
        foreach ($this->getRules() as $rule) {
            $punySuffix = $rule['punySuffix'];
            if ($punySuffix === null) {
                continue;
            }

            // Match on a label boundary ("foocom" must not match "com"); a name
            // equal to the suffix matches as well.
            $matches = $punyDomain === $punySuffix
                || self::endsWith($punyDomain, '.' . $punySuffix) !== null;
            if (!$matches) {
                continue;
            }

            $rule['length'] = strlen($punySuffix);
            if ($best === null || $rule['length'] >= $best['length']) {
                $best = $rule;
            }
        }

        return $best;
    }

    /**
     * Translate a validation error code into its message.
     *
     * @throws \InvalidArgumentException When $error is not a known code.
     */
    public function errorCodes(string $error): string
    {
        if (!isset(self::ERROR_MESSAGES[$error])) {
            throw new \InvalidArgumentException("Unknown error code: {$error}");
        }

        return self::ERROR_MESSAGES[$error];
    }

    /**
     * Strip a single trailing dot (fully-qualified notation) from $domain.
     */
    public static function removeTrailingDot(string $domain): string
    {
        if (substr($domain, -1) === '.') {
            return substr($domain, 0, -1);
        }

        return $domain;
    }

    /**
     * Validate a (lowercased, dot-stripped) domain name.
     *
     * @return string|null An error code understood by errorCodes(), or null when valid.
     */
    public function validate(string $input): ?string
    {
        if (!self::isValidUtf8($input)) {
            $input = $this->latin1ToUtf8($input);
        }

        // When the name cannot be converted, check the raw input instead so the
        // label rules below can report the specific problem.
        $ascii = $this->toAscii($input) ?? $input;

        if (strlen($ascii) < 1) {
            return 'DOMAIN_TOO_SHORT';
        }
        if (strlen($ascii) > 255) {
            return 'DOMAIN_TOO_LONG';
        }

        foreach (explode('.', $ascii) as $label) {
            if ($label === '') {
                return 'LABEL_TOO_SHORT';
            }
            if (strlen($label) > 63) {
                return 'LABEL_TOO_LONG';
            }
            if ($label[0] === '-') {
                return 'LABEL_STARTS_WITH_DASH';
            }
            if (substr($label, -1) === '-') {
                return 'LABEL_ENDS_WITH_DASH';
            }
            // The D modifier stops "$" from matching before a trailing newline.
            if (!preg_match('/^[a-z0-9\-]+$/D', $label)) {
                return 'LABEL_INVALID_CHARS';
            }
        }

        return null;
    }

    /**
     * Whether $input is well-formed UTF-8 made of printable ASCII, tab, CR, LF
     * and valid multi-byte sequences.
     */
    public static function isValidUtf8(string $input): bool
    {
        return preg_match(
            '%^(?:
                  [\x09\x0A\x0D\x20-\x7E]            # ASCII
                | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
                | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
                | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
                | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
                | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
                | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
                | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
            )*$%xs',
            $input
        ) === 1;
    }

    /**
     * Load the rules once and cache them on the instance.
     *
     * NOTE(review): assumes rules.json is a JSON array of rule strings such as
     * "com", "*.ck" and "!www.ck" — confirm against the data file.
     *
     * @return array<int|string, array<string, mixed>> Each entry has the keys
     *         rule, suffix, punySuffix, wildcard and exception.
     * @throws \RuntimeException When the file cannot be read or decoded.
     */
    private function getRules(): array
    {
        if ($this->rules !== null) {
            return $this->rules;
        }

        $path = './data/rules.json';
        $json = is_readable($path) ? file_get_contents($path) : false;
        if ($json === false) {
            throw new \RuntimeException("Unable to read public suffix rules from {$path}");
        }

        $rulesData = json_decode($json, true);
        if (!is_array($rulesData)) {
            throw new \RuntimeException("Invalid public suffix rules in {$path}: " . json_last_error_msg());
        }

        $data = [];
        foreach ($rulesData as $key => $rule) {
            if (!is_string($rule) || $rule === '') {
                continue;
            }
            $suffix = (string) preg_replace('/^(\*\.|\!)/', '', $rule);
            $data[$key] = [
                'rule' => $rule,
                'suffix' => $suffix,
                'punySuffix' => $this->toAscii($suffix),
                // Inspect the first character; strpos() can never return '*' or '!'.
                'wildcard' => $rule[0] === '*',
                'exception' => $rule[0] === '!',
            ];
        }

        return $this->rules = $data;
    }

    /**
     * Byte-wise suffix test.
     *
     * @return int|null Length of $needle when $haystack ends with it, null otherwise.
     */
    public static function endsWith(string $haystack, string $needle): ?int
    {
        $needleLength = strlen($needle);
        if ($needleLength > strlen($haystack)) {
            return null;
        }
        if (substr_compare($haystack, $needle, -$needleLength) === 0) {
            return $needleLength;
        }

        return null;
    }

    /**
     * @deprecated Unused legacy helper kept for compatibility; use endsWith().
     *
     * @return int 1 when $punyDomain ends with $suffix, 0 otherwise.
     */
    private function endWith(string $punyDomain, string $suffix): int
    {
        return self::endsWith($punyDomain, $suffix) !== null ? 1 : 0;
    }

    /**
     * Export $d and, when the parsed name contains punycode labels ("xn--"),
     * convert the domain and subdomain fields to ASCII.
     *
     * @return array<string, mixed>
     */
    private function withPunycode(Domain $d, string $domain): array
    {
        $result = $d->getResult();
        if (strpos($domain, 'xn--') === false) {
            return $result;
        }

        foreach (['domain', 'subdomain'] as $key) {
            if ($result[$key] !== null && $result[$key] !== '') {
                $result[$key] = $this->handlePunyCode($result[$key]);
            }
        }

        return $result;
    }

    /**
     * Convert a name to ASCII, returning null for empty input or when the
     * conversion fails.
     */
    private function toAscii(string $name): ?string
    {
        if ($name === '') {
            return null;
        }

        // Pure ASCII names need no IDNA processing.
        if (!preg_match('/[^\x00-\x7F]/', $name)) {
            return strtolower($name);
        }

        if (!function_exists('idn_to_ascii')) {
            return null;
        }

        try {
            $ascii = idn_to_ascii($name, IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46);
        } catch (\ValueError $e) {
            // Treat an exception from the converter the same as a false return.
            return null;
        }

        return is_string($ascii) && $ascii !== '' ? $ascii : null;
    }

    /**
     * Lowercase $value; multibyte-aware when the input is valid UTF-8 and
     * mbstring is available, byte-wise otherwise.
     */
    private function lowercase(string $value): string
    {
        if (function_exists('mb_strtolower') && self::isValidUtf8($value)) {
            return mb_strtolower($value, 'UTF-8');
        }

        return strtolower($value);
    }

    /**
     * Re-encode an ISO-8859-1 string as UTF-8.
     */
    private function latin1ToUtf8(string $value): string
    {
        if (function_exists('mb_convert_encoding')) {
            $converted = mb_convert_encoding($value, 'UTF-8', 'ISO-8859-1');

            return is_string($converted) ? $converted : $value;
        }
        if (function_exists('utf8_encode')) {
            // Deprecated since PHP 8.2; used only when mbstring is missing.
            return utf8_encode($value);
        }

        return $value;
    }
}
// Demo run: parse one sample host name and print the parsed result.
// Other inputs worth trying:
//   "www.google.co.uk", "täst.de.", "local", "a.b.c.d.foo.com"
$demoInput = "食狮.com.cn";
$demoParser = new psl($demoInput);
// Print the result explicitly rather than relying on debug output from inside the parser.
var_dump($demoParser->getDomain($demoInput));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment