# Install IPython (an interactive Python console).
pip install ipython
# Start an interactive Scrapy shell session for the given URL.
scrapy shell http://www.example.com
# Copy the response's status and URL into the item being built.
# NOTE(review): `item` and `response` are not defined in this snippet;
# presumably they come from the Scrapy shell / spider callback -- confirm.
item['status'] = response.status
item['url'] = response.url
# Show the response object's instance attribute dictionary.
response.__dict__
# extract all links
from collections import OrderedDict
import urlparse
# Collect the raw href value of every <a> element on the page.
# NOTE(review): `sel` and `response` are presumably the objects provided by
# the Scrapy shell session; they are not defined in this snippet.
all_links = sel.select("//a/@href").extract()
# remove duplicates (OrderedDict keeps the first occurrence and the order)
all_links = list(OrderedDict.fromkeys(all_links))
# transform links to absolute
response_url = response.url
# Resolve every href against the page URL. Duplicates are removed again
# afterwards because hrefs that differ only in surrounding whitespace or in
# relative form can resolve to the same absolute URL.
absolute_links = list(OrderedDict.fromkeys(
    urlparse.urljoin(response_url, link.strip()) for link in all_links
))
# validate mailto
import re

# Evaluates to True when the candidate string does NOT look like an e-mail
# address (something, "@", something, ".", something).
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being treated as a string escape.
# NOTE(review): the sample address below contains no "@" (it looks like an
# obfuscation placeholder), so this expression is True -- confirm the
# intended sample value.
re.compile(r".+@.+\..+").match("mailto:[email protected]") is None
# Extract every <loc> entry from sitemap.xml into sitemap_links.txt.
# Assumes Nokogiri is already loaded (e.g. `require 'nokogiri'` in the session).

# Parse the sitemap. The block form of File.open closes the file handle as
# soon as parsing is done instead of leaving it open.
data = File.open('sitemap.xml') { |file| Nokogiri::XML(file) }
# Strip namespaces -- presumably so the un-prefixed '//loc' XPath below
# matches namespaced sitemap elements.
data.remove_namespaces!
# Text content of every <loc> element, in document order.
links = data.xpath('//loc').map(&:content)
# Number of <loc> entries found (useful when run interactively).
links.length
# Write one link per line (no trailing newline after the last link).
File.write('sitemap_links.txt', links.join("\n"))