Created
August 24, 2015 17:56
-
-
Save apendleton/caddc194dde259a08a31 to your computer and use it in GitHub Desktop.
Link grabbing example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First, make sure the required libraries are installed; this example uses
# pyquery and requests:
#     pip install pyquery requests
try:
    from urlparse import urljoin  # Python 2
except ImportError:  # Python 3 moved urljoin into urllib.parse
    from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq
URL = "http://www.stat-gabon.org/"

# Fetch the page. The timeout stops the script from hanging forever on an
# unresponsive server, and raise_for_status() raises on an HTTP error status
# instead of letting an error page be scraped as if it were the real content.
response = requests.get(URL, timeout=30)
response.raise_for_status()
page = pq(response.content)

# Collect (link text, raw href) once for every <a> element that has an href
# attribute, so the selector runs a single time and all four listings below
# are built from the same set of links.
links = [(link.text(), link.attr('href')) for link in page('a[href]').items()]

# NOTE: every print() call below takes exactly one argument, so the output is
# the same whether it is parsed as a Python 2 print statement or called as the
# Python 3 print function.

# print all the URLs on the page
print('all links')
print([href for _, href in links])

# the same as above, but with absolute URLs (relative hrefs resolved against URL)
print('all links with absolute urls')
print([urljoin(URL, href) for _, href in links])

# the same as above, but with the URL labels as well
print('all links, absolute urls, and link text')
print([(text, urljoin(URL, href)) for text, href in links])

# the same as above, but only for links that have 'pdf' in them somewhere
# (case-insensitive match against the raw href)
print('pdf links, absolute urls, and link text')
print([(text, urljoin(URL, href)) for text, href in links if 'pdf' in href.lower()])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment