Created
November 20, 2016 03:43
-
-
Save samcv/1d0086fcb06a9e8be858e9357d04ecf0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6; | |
=TITLE URL::Find | |
=SUBTITLE A Perl 6 module to find all the URL's in a set of text. | |
=head1 DESCRIPTION | |
=para | |
By default it will match domain names | |
that use unicode characters such as http://правительство.рф. To only match ASCII domains use the | |
:ascii option. It will also find URL's that end in one of the restricted characters, so | |
`https://www.google.com, ` will pull out `https://www.google.com`. It will find all the URL's in a | |
text by default, or you can specify a maximum number with the :limit option. By default it will | |
only find http, https, ftp, git and ssh schemes, but you can specify `:any<1>` to match any schemes | |
with legal characters.. | |
grammar url { | |
has $.ascii = 0; | |
has $.any = 0; | |
regex TOP { <anyprotocol> '://' <base> [<after>+]? '/'? } | |
token anyprotocol { <[ a..z A..Z ]> <[ a..z A..Z 0..9 . + - ]>+ } | |
token protocol {:i [http|https|ftp|git|ssh] } | |
token baseascii { [ <[a..z A..Z 0..9 \- . ]> ]+ } | |
token base { [ <:Number + :Letter + [ . - ]> ]+ } | |
token protected { <[ $ + ! * ( ) , . ; ? @ = % & # " ' ]> } | |
token allowed { \S } | |
regex term { <allowed>+ <!after <protected>> } | |
token after { '/' <term> } | |
} | |
class url-actions { | |
method TOP ($/) { | |
make { | |
protocol => $<anyprotocol>.made, | |
#url => $<anyprotocol>.made ~ '://' ~ $<base> ~ $<after>.join | |
} | |
} | |
method protocol ($/) { | |
make $/.lc; | |
} | |
method anyprotocol ($/) { | |
make $/.lc; | |
} | |
} | |
#| Accepts a string and returns a list of URL's. Optionally you can specify a limit to the number | |
#| of URL's returned, or whether you want to only match URL's with ASCII domain names: :ascii<1> | |
#| Matches only http https ftp git and ssh schemes by default. To match any scheme, use :any<1> | |
sub find-urls ( Str $string, Num :$limit? is copy, :$ascii?, :$any? ) is export { | |
$limit = ∞ if ! $limit.defined; | |
my $base = $ascii ?? <url:baseascii> !! <url:base>; | |
my $protocol = $any ?? <url:anyprotocol> !! <url:protocol>; | |
my $match = url.parse($string, :actions(url-actions.new) ); | |
say $match.made.perl; | |
#say $match; | |
#my $url-regex = regex { <{ $any ?? &url:anyprotocol !! <protocol }> '://' <$base> [<after>+]? '/'? }; | |
#$string.comb($url-regex, $limit); | |
} | |
find-urls("https://google.com/something"); | |
=AUTHOR Samantha McVey (samcv) [email protected] | |
=LICENSE | |
This is free software; you can redistribute it and/or modify it under | |
the Artistic License 2.0. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment