Skip to content

Instantly share code, notes, and snippets.

@umidjons
Last active October 25, 2016 17:45
Show Gist options
  • Save umidjons/bdfd6517018c8aa45459d5acbdafc986 to your computer and use it in GitHub Desktop.
Save umidjons/bdfd6517018c8aa45459d5acbdafc986 to your computer and use it in GitHub Desktop.
Web page parser using DOMDocument class

Web page parser using DOMDocument class

Source from the book PHP 7 Programming Cookbook by Doug Bierer with some improvements.

File Application/Web/Parser.php:

<?php
namespace Application\Web;

use \DOMDocument, \DOMNodeList, \DOMElement, \DOMAttr;

/**
 * Loads an HTML page into a DOMDocument and extracts tags or attribute
 * values from it.
 */
class Parser
{
    /** @var DOMDocument|null Parsed page; stays null until getContent() is first called. */
    protected $content;

    /** @var string Address of the page to parse. */
    protected $url;

    /**
     * @param string $url Page address. When it does not start with "http",
     *                    "http://" is prepended the first time the page is loaded.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Returns the parsed page, loading it on first use and reusing it afterwards.
     *
     * @return DOMDocument
     */
    public function getContent()
    {
        if (!$this->content) {
            // The comparison has to be applied to the result of stripos();
            // comparing inside the argument list would pass a boolean needle
            // and the scheme would never be checked.
            if (stripos($this->url, 'http') !== 0) {
                $this->url = 'http://' . $this->url;
            }

            $this->content = new DOMDocument('1.0', 'utf-8');
            $this->content->preserveWhiteSpace = false;
            // Warnings are deliberately silenced: loading is best-effort and
            // real-world markup is frequently malformed.
            @$this->content->loadHTMLFile($this->url);
        }

        return $this->content;
    }

    /**
     * Collects every element with the given tag name.
     *
     * @param string $tag Tag name to look for, e.g. "img".
     *
     * @return array[] One entry per element: 'value' holds the node text with
     *                 whitespace runs collapsed to single spaces and trimmed;
     *                 'attributes' (name => value) is present only when the
     *                 element has attributes.
     */
    public function getTags($tag)
    {
        $result = [];

        /** @var DOMNodeList $elements */
        $elements = $this->getContent()->getElementsByTagName($tag);

        /** @var DOMElement $node */
        foreach ($elements as $node) {
            // Build a fresh entry for each node so attributes collected from
            // earlier elements cannot leak into this one.
            $item = [
                'value' => trim(preg_replace('/\s+/', ' ', $node->nodeValue)),
            ];

            if ($node->hasAttributes()) {
                /** @var DOMAttr $attr */
                foreach ($node->attributes as $name => $attr) {
                    $item['attributes'][$name] = $attr->value;
                }
            }

            $result[] = $item;
        }

        return $result;
    }

    /**
     * Collects the values of one attribute across all elements of the page.
     *
     * @param string      $attr   Attribute name, e.g. "href".
     * @param string|null $domain Optional case-insensitive substring filter;
     *                            when given, only values containing it are returned.
     *
     * @return string[] Trimmed attribute values in document order.
     */
    public function getAttribute($attr, $domain = null)
    {
        $result = [];

        /** @var DOMNodeList $elements */
        $elements = $this->getContent()->getElementsByTagName('*');

        /** @var DOMElement $node */
        foreach ($elements as $node) {
            if (!$node->hasAttribute($attr)) {
                continue;
            }

            $value = $node->getAttribute($attr);

            // An empty/absent filter accepts every value.
            if (!$domain || stripos($value, $domain) !== false) {
                $result[] = trim($value);
            }
        }

        return $result;
    }
}

File index.php:

<?php
// Demo script: fetches a page and prints its <img> tags and href attributes.

// Forward slashes anchored at __DIR__ resolve on every platform; backslash
// separators are only understood as path separators on Windows.
require __DIR__ . '/Application/Autoload/Loader.php';
\Application\Autoload\Loader::init(__DIR__);

$url = 'http://oreilly.com';

$parser = new Application\Web\Parser($url);

// The dumped values come from a remote page, so escape them before embedding
// them in HTML. ENT_SUBSTITUTE keeps the output non-empty when the page
// contains bytes that are not valid UTF-8.
$dump = function ($data) {
    return htmlspecialchars(print_r($data, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
};

echo 'All "img" tags:<br><pre>', $dump($parser->getTags('img')), '</pre>';
echo 'All "href" attributes:<br><pre>', $dump($parser->getAttribute('href')), '</pre>';
echo 'All "href" attributes with "css" domain:<br><pre>',
    $dump($parser->getAttribute('href', 'css')),
    '</pre>';

Note: the class is loaded using the autoloader from this gist

Sample output:

All "img" tags:
Array
(
    [0] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //cdn.oreillystatic.com/oreilly/promos/ba-security-ny-20161019.png
                    [width] => 720
                    [height] => 298
                    [alt] => O'Reilly Security Conference in New York, NY, October 31 � November 2, 2016. See what you'll learn.
                )

        )

    [1] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //cdn.oreillystatic.com/oreilly/promos/ba-sa-ca-20161013.png
                    [width] => 720
                    [height] => 298
                    [alt] => O'Reilly Software Architecture Conference in San Francisco, CA, November 14-16, 2016. See what you'll learn.
                )

        )

    [2] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //cdn.oreillystatic.com/oreilly/promos/ba-live-training-cal-20160916.png
                    [width] => 724
                    [height] => 298
                    [alt] => Live training events calendar. See all upcoming events.
                )

        )

    [3] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920047124/rc_thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [4] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/9781680502008/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [5] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/9780994347008/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [6] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920051961/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [7] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920055570/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [8] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920054993/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [9] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/9781593277413/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [10] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920055594/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [11] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920047506/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [12] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920049517/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [13] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/9781593277604/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [14] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920051732/rc_thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [15] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920041504/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [16] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920031833/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [17] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920044079/rc_thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [18] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920047391/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [19] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920042228/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [20] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920031130/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [21] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920052654/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [22] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //akamaicovers.oreilly.com/images/0636920052616/thumb.gif
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                )

        )

    [23] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //cdn.oreillystatic.com/oreilly/promos/homepage-newsletter-quote-20160525.png
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                    [style] => margin:10px auto 15px;
                )

        )

    [24] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //cdn.oreillystatic.com/oreilly/promos/online-training-photo-20160603.jpg
                    [width] => 724
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                    [style] => position:absolute; z-index:1; top:0; left:0; display:block;
                )

        )

    [25] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => http://cdn.oreillystatic.com/oreilly/promos/safari-logo-202x57.png
                    [width] => 150
                    [height] => 298
                    [alt] => Safari
                    [itemprop] => image
                    [style] => position:absolute; z-index:1; top:0; left:0; display:block;
                    [class] => mb
                )

        )

    [26] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => http://covers.oreillystatic.com/images/0636920050612/rc_bkt.gif
                    [width] => 150
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                    [style] => position:absolute; z-index:1; top:0; left:0; display:block;
                    [class] => floatl book-cover media
                )

        )

    [27] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => http://covers.oreillystatic.com/images/9780128020425/bkt.gif
                    [width] => 150
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                    [style] => position:absolute; z-index:1; top:0; left:0; display:block;
                    [class] => floatl book-cover media
                )

        )

    [28] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => http://covers.oreillystatic.com/images/0636920045052/thumb.gif
                    [width] => 150
                    [height] => 298
                    [alt] => 
                    [itemprop] => image
                    [style] => position:absolute; z-index:1; top:0; left:0; display:block;
                    [class] => video-cover media
                )

        )

    [29] => Array
        (
            [value] => 
            [attributes] => Array
                (
                    [src] => //cdn.oreillystatic.com/images/sitewide-headers/tarsier-footer.png
                    [width] => 150
                    [height] => 298
                    [alt] => Tarsier
                    [itemprop] => image
                    [style] => position:absolute; z-index:1; top:0; left:0; display:block;
                    [class] => video-cover media
                )

        )

)
All "href" attributes:
Array
(
    [0] => https://plus.google.com/108442503368488643007
    [1] => http://www.oreilly.com
    [2] => //www.oreilly.com/favicon.ico
    [3] => //cdn.oreillystatic.com/oreilly/ml/css/stylesheet.css
    [4] => //cdn.oreillystatic.com/oreilly/ml/css/ml.css
    [5] => //www.oreilly.com/css/oreilly.css
    [6] => //cdn.oreillystatic.com/assets/css/norm-home-160908.css
    [7] => http://feeds.feedburner.com/oreilly/newbooks
    [8] => http://feeds.feedburner.com/oreilly/upcomingbooks
    [9] => http://feeds.feedburner.com/oreilly/ebookdealoftheday
    [10] => //oreilly.com
    [11] => //www.oreilly.com/ideas
    [12] => //www.oreilly.com/learning
    [13] => //www.oreilly.com/conferences/
    [14] => //shop.oreilly.com/
    [15] => http://members.oreilly.com
    [16] => /topics/ai
    [17] => /topics/economy
    [18] => /topics/business
    [19] => /topics/data
    [20] => /topics/design
    [21] => /topics/operations
    [22] => /topics/security
    [23] => /topics/software-engineering
    [24] => /topics/software-architecture
    [25] => https://www.oreilly.com/topics
    [26] => #
    [27] => #
    [28] => #
    [29] => http://conferences.oreilly.com/security/network-data-security-ny
    [30] => http://conferences.oreilly.com/software-architecture/engineering-business-ca
    [31] => http://www.oreilly.com/live-training/
    [32] => http://shop.oreilly.com/category/new.do
    [33] => //shop.oreilly.com/product/0636920047124.do
    [34] => //shop.oreilly.com/product/0636920047124.do
    [35] => //shop.oreilly.com/product/9781680502008.do
    [36] => //shop.oreilly.com/product/9781680502008.do
    [37] => //shop.oreilly.com/product/9780994347008.do
    [38] => //shop.oreilly.com/product/9780994347008.do
    [39] => //shop.oreilly.com/product/0636920051961.do
    [40] => //shop.oreilly.com/product/0636920051961.do
    [41] => //shop.oreilly.com/product/0636920055570.do
    [42] => //shop.oreilly.com/product/0636920055570.do
    [43] => //shop.oreilly.com/product/0636920054993.do
    [44] => //shop.oreilly.com/product/0636920054993.do
    [45] => //shop.oreilly.com/product/9781593277413.do
    [46] => //shop.oreilly.com/product/9781593277413.do
    [47] => //shop.oreilly.com/product/0636920055594.do
    [48] => //shop.oreilly.com/product/0636920055594.do
    [49] => //shop.oreilly.com/product/0636920047506.do
    [50] => //shop.oreilly.com/product/0636920047506.do
    [51] => //shop.oreilly.com/product/0636920049517.do
    [52] => //shop.oreilly.com/product/0636920049517.do
    [53] => //shop.oreilly.com/product/9781593277604.do
    [54] => //shop.oreilly.com/product/9781593277604.do
    [55] => //shop.oreilly.com/product/0636920051732.do
    [56] => //shop.oreilly.com/product/0636920051732.do
    [57] => //shop.oreilly.com/product/0636920041504.do
    [58] => //shop.oreilly.com/product/0636920041504.do
    [59] => //shop.oreilly.com/product/0636920031833.do
    [60] => //shop.oreilly.com/product/0636920031833.do
    [61] => //shop.oreilly.com/product/0636920044079.do
    [62] => //shop.oreilly.com/product/0636920044079.do
    [63] => //shop.oreilly.com/product/0636920047391.do
    [64] => //shop.oreilly.com/product/0636920047391.do
    [65] => //shop.oreilly.com/product/0636920042228.do
    [66] => //shop.oreilly.com/product/0636920042228.do
    [67] => //shop.oreilly.com/product/0636920031130.do
    [68] => //shop.oreilly.com/product/0636920031130.do
    [69] => //shop.oreilly.com/product/0636920052654.do
    [70] => //shop.oreilly.com/product/0636920052654.do
    [71] => //shop.oreilly.com/product/0636920052616.do
    [72] => //shop.oreilly.com/product/0636920052616.do
    [73] => #
    [74] => #
    [75] => http://www.oreilly.com/emails/newsletters/
    [76] => http://www.oreilly.com/emails/newsletters/
    [77] => http://www.oreilly.com/jobs/
    [78] => http://jobs.jobvite.com/oreilly-media/job/oP683fwQ
    [79] => http://jobs.jobvite.com/oreilly-media/job/oPUu3fw0
    [80] => http://www.oreilly.com/jobs/
    [81] => http://www.oreilly.com/live-training/
    [82] => http://shop.oreilly.com/category/videos.do
    [83] => http://shop.oreilly.com/category/videos.do
    [84] => https://www.safaribooksonline.com/?utm_medium=referral&utm_campaign=publisher&utm_source=oreilly&utm_content=homepage
    [85] => https://www.safaribooksonline.com/?utm_medium=referral&utm_campaign=publisher&utm_source=oreilly&utm_content=homepage
    [86] => http://shop.oreilly.com/product/0636920050612.do?code=DEAL
    [87] => http://shop.oreilly.com/product/0636920050612.do?code=DEAL
    [88] => http://shop.oreilly.com/product/9780128020425.do?code=MSDEAL
    [89] => http://shop.oreilly.com/product/9780128020425.do?code=MSDEAL
    [90] => http://shop.oreilly.com/product/0636920045052.do?code=VDWK
    [91] => http://shop.oreilly.com/product/0636920045052.do?code=VDWK
    [92] => http://support.oreilly.com/oreilly?from_gsfn=true
    [93] => //oreilly.com/about/
    [94] => //oreilly.com/work-with-us.html
    [95] => //shop.oreilly.com/category/customer-service.do
    [96] => //www.oreilly.com/about/contact.html
    [97] => http://fb.co/OReilly
    [98] => http://twitter.com/oreillymedia
    [99] => https://www.youtube.com/user/OreillyMedia
    [100] => https://plus.google.com/+oreillymedia
    [101] => https://www.linkedin.com/company/o%27reilly-media
    [102] => //oreilly.com
    [103] => //oreilly.com/terms/
    [104] => //oreilly.com/privacy.html
    [105] => //www.oreilly.com/about/editorial_independence.html
)
All "href" attributes with "css" domain:
Array
(
    [0] => //cdn.oreillystatic.com/oreilly/ml/css/stylesheet.css
    [1] => //cdn.oreillystatic.com/oreilly/ml/css/ml.css
    [2] => //www.oreilly.com/css/oreilly.css
    [3] => //cdn.oreillystatic.com/assets/css/norm-home-160908.css
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment