scrapy_nokogiri.md

Scrapy

scrapybook
scrapy shell
custom link filtering

pip install ipython
scrapy shell http://www.example.com

# Record the response's status and URL on the item being built.
# NOTE(review): assumes `item` and `response` already exist in the current
# session (e.g. the scrapy shell started above) -- confirm before reuse.
item['status'] = response.status
item['url'] = response.url

# Dump the response object's instance attributes for inspection.
response.__dict__

# Collect every link on the page, de-duplicated and resolved to an absolute URL.
# NOTE(review): `sel` and `response` are presumably the objects provided by the
# scrapy shell session started above -- they are not defined in this snippet.
from collections import OrderedDict
import urlparse

# extract all links
all_links = sel.select("//a/@href").extract()

# remove duplicates (OrderedDict.fromkeys keeps the first-seen order)
all_links = list(OrderedDict.fromkeys(all_links))

# transform links to absolute: resolve each href against the page URL,
# stripping surrounding whitespace from the attribute value first
response_url = response.url
all_links = [urlparse.urljoin(response_url, link.strip()) for link in all_links]

# validate mailto: the expression below is True when the string does NOT
# contain something shaped like "name@host.tld" (i.e. the link is invalid).
import re

# Raw string so that `\.` reaches the regex engine as an escaped dot;
# "user@example.com" is a placeholder address.
re.compile(r".+@.+\..+").match("mailto:user@example.com") is None

# Nokogiri (Ruby): write the text of every <loc> element in sitemap.xml
# to sitemap_links.txt, one entry per line.
require 'nokogiri'

# Block form of File.open closes the file handle as soon as parsing is done.
data = File.open('sitemap.xml') { |f| Nokogiri::XML(f) }

# NOTE(review): namespaces are removed presumably so that the plain '//loc'
# XPath matches the namespaced sitemap elements -- confirm against the input.
data.remove_namespaces!
loc_nodes = data.xpath('//loc')
loc_nodes.length

links = loc_nodes.map(&:content)

File.open('sitemap_links.txt', 'w') do |f|
  f.write(links.join("\n"))
end

bootstraponline/scrapy_nokogiri.md

Select an option

No results found

Select an option

No results found

Scrapy