Created
December 7, 2011 13:06
-
-
Save wflanagan/1442731 to your computer and use it in GitHub Desktop.
complete links function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds one info hash per usable link on the page and memoizes the list in
# @complete_links.
#
# Links are skipped when they are not Nokogiri elements, have a nil/empty
# href, repeat an href already collected, or use the javascript:/mailto:
# schemes. Each remaining href is partially percent-unescaped, resolved
# against `url`, stripped of its fragment (and of an empty query), and
# passed through `link_info_for_pagelink`.
#
# NOTE(review): `links`, `url`, `log` and `link_info_for_pagelink` are defined
# outside this method; `links` is presumably the page's <a> NodeSet — confirm.
# NOTE(review): a non-blank cached result is returned regardless of `opts`,
# so a later call with a different :limit gets the first call's list.
#
# @param opts [Hash] options; :limit caps how many links are examined
#   (at most :limit nodes — the previous inclusive range examined limit + 1)
# @return [Array<Hash>] string-keyed info hashes, in document order
def complete_links(opts = {})
  return @complete_links unless @complete_links.blank?

  # slice(start, length) yields at most :limit nodes; the former
  # slice(0..limit) was inclusive and yielded one too many.
  link_list = opts[:limit] ? links.slice(0, opts[:limit]) : links

  @complete_links = []
  # to try to solve bug with bad encoding on a nodeset
  return @complete_links unless link_list.is_a?(Nokogiri::XML::NodeSet)

  # Ordered, case-sensitive replacements. "%25" appears twice and "%3A"/"%2F"
  # are repeated at the end on purpose: later passes pick up sequences exposed
  # by decoding a doubly-escaped "%25". Order matters; do not sort or dedupe.
  unescapes = [
    ["%25", "%"], ["%3D", "="], ["%26", "&"], ["%3A", ":"], ["%2F", "/"],
    ["%3F", "?"], ["%25", "%"], ["%3B", ";"], ["%3A", ":"], ["%2F", "/"]
  ]
  seen = {} # raw hrefs already collected (hash lookup instead of Array#include?)

  link_list.each do |link|
    unless link.is_a?(Nokogiri::XML::Element)
      begin
        log "TYPHOEUSPATCH: ERROR: NonXMLElement #{link.inspect}"
      rescue => e
        puts "TYPHOEUS: ERROR: #{e.inspect} #{e.backtrace.first}"
      end
      # Skip even when logging itself failed; previously a failure here fell
      # through and the non-element node was processed as a link.
      next
    end

    href = nil
    begin
      href = link["href"]
      next if href.nil? || href.length == 0
      next if seen.key?(href)
      next if href.downcase =~ /^(javascript|mailto):/
    rescue => e
      puts "TYPHOEUSPATCH: ERROR: #{link["href"]} #{e.inspect} #{e.backtrace.first}"
      next
    end

    begin
      begin
        # gsubbing the link hrefs/urls to get rid of escaped characters that
        # impact our ability to do our job
        cleaned_href = unescapes.inject(href) { |str, (from, to)| str.gsub(from, to) }
        x = Addressable::URI.parse(url).join(cleaned_href)
      rescue
        puts "TYPHOEUSPATCH: Error: ParseError: #{link["href"]}"
        next
      end

      # Check for a nil path before squeezing; the original squeezed first,
      # which made its nil guard unreachable.
      path = x.path
      x.path = path.nil? ? "/" : path.squeeze("/")
      x.query = nil if x.query && x.query.empty?
      x.fragment = nil
      # x = PostRank::URI.c18n(PostRank::URI.unescape(x.to_s))

      link_info = {
        'href' => href,
        'ref' => link['ref'],
        'rel' => link['rel'],
        'title' => link['title'],
        'alt' => link['alt'],
        'images' => link.css('img').map do |img|
          { :src => img["src"], :alt => img["alt"], :title => img["title"] }
        end,
        'text' => link.text.to_s,
        'url' => x.to_s,
        'absolute_url' => x.to_s
      }

      @complete_links << link_info_for_pagelink(link_info).stringify_keys
      seen[href] = true
    rescue
      # Catches any StandardError from the steps above, not only parse errors.
      puts "TYPHOEUSPATCH: ERROR: ParseError: #{link["href"]}"
      next
    end
  end

  @complete_links
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment