Sample code: From Python to Dragon, continuing to build the crawler from checkpoint 1 @ Taichung Python Meetup
# -*- coding: utf-8 -*-
target_url = "https://www.khanacademy.org/"
# -------------------------------------------------------
# Look up the domain's registration record with python-whois.
import whois
#print(whois.whois(target_url))
# -------------------------------------------------------
# Fetch the site's robots.txt with requests to see what crawling it allows.
import requests
#resp = requests.get(target_url + "robots.txt")
#print(resp.text)
#import urllib.robotparser  # try it and play with it (Python 2's robotparser module lives here in Python 3)
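# A minimal sketch (not in the original gist) of the robotparser idea hinted at
# above: urllib.robotparser in the standard library parses robots.txt and can
# answer whether a given user agent may fetch a URL. Uncomment to try it.
#import urllib.robotparser
#rp = urllib.robotparser.RobotFileParser()
#rp.set_url(target_url + "robots.txt")
#rp.read()
#print(rp.can_fetch("*", target_url))  # True if any ("*") agent may crawl the root page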
# -------------------------------------------------------
# Detect the site's technology stack with the builtwith package.
import builtwith
import pprint; p = pprint.pprint; import pdb; pdb.set_trace()  # drop into the debugger before printing
print(builtwith.parse(target_url))
# -------------------------------------------------------
# curl vs. httpie: the same request from the shell (the leading "!" is IPython's shell-escape syntax).
# !curl http://www.meetup.com/Taichung-Python-Meetup/events/227386858/
# !http http://www.meetup.com/Taichung-Python-Meetup/events/227386858/
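# A rough Python analogue (not in the original gist) of the two shell tools
# above: requests can print the status line, headers, and body that curl and
# httpie show. The meetup_url name is introduced here for illustration only.
#meetup_url = "http://www.meetup.com/Taichung-Python-Meetup/events/227386858/"
#resp = requests.get(meetup_url)
#print(resp.status_code, resp.reason)
#for name, value in resp.headers.items():
#    print("%s: %s" % (name, value))
#print(resp.text[:500])  # first part of the body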
# -------------------------------------------------------
# Scrapely: train a scraper on one annotated page, then apply it to a similar page.
#from scrapely import Scraper
#s = Scraper()
#train_url = "http://pypi.python.org/pypi/w3lib/1.1"
#data = {'name': 'w3lib 1.1', 'author': 'Scrapy project', 'description': 'Library of web-related functions'}
#s.train(train_url, data)
#test_url = "http://pypi.python.org/pypi/parrot/0.0.9"
#print(s.scrape(test_url))
# -------------------------------------------------------
# PyQuery: jQuery-style selectors over the fetched page; collect every <a> href.
from pyquery import PyQuery as pq
target2_url = "https://www.khanacademy.org/"
#d = pq(url=target2_url)
#links = [el.attrib.get("href", False) for el in d("a")]
#import pprint; p = pprint.pprint; import pdb; pdb.set_trace()  # pause in the debugger to inspect the list
#print(links)
#print(len(links))
# -------------------------------------------------------
# Splinter: drive a real browser, visit the page, and collect every <a> href.
from splinter import Browser
with Browser() as browser:
    browser.visit(target2_url)
    elements = browser.find_by_tag("a")
    links = [el["href"] for el in elements]
    import pprint; p = pprint.pprint; import pdb; pdb.set_trace()  # pause in the debugger to inspect the list
    print(links)
    print(len(links))