Last active
October 17, 2017 02:37
-
-
Save epitron/10011220 to your computer and use it in GitHub Desktop.
An Instapaper scraper.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Instapaper scraper: signs in to instapaper.com with Mechanize, follows the
# links to the export page, submits the CSV export form, saves the response to
# ~/backup/instapaper/instapaper-<date>.csv and finally runs the merge script.
require 'mechanize'

# Credentials typed into the login form below.
USERNAME = ""
PASSWORD = ""

# TODO: Save cookies with "http.cookie_jar.{load,save} filename"
# TODO: Store password in ~/.config or some kind of wallet

http = Mechanize.new do |a|
  a.user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.149 Safari/537.36"
  # Certificate verification is deliberately left at Mechanize's default
  # (peer verification): this session submits the account password, so it must
  # not be sent over an unverified connection.
end

puts "Logging in..."

# Start on HTTPS so the login flow never touches a plaintext connection.
page = http.get("https://www.instapaper.com/")
page = page.links.find { |l| l.text["Sign In"] }.click

page = page.form_with action: "/user/login" do |form|
  form.username = USERNAME
  form.password = PASSWORD
end.click_button

puts "Clicking /user"
page = page.link_with(href: "/user").click

puts "Clicking /user/export"
page = page.link_with(href: "/user/export").click

puts "Clicking /export/csv"
csv = page.form_with(action: "/export/csv").click_button

# One output file per day; a second run on the same day targets the same path.
date    = Time.now.strftime("%Y-%m-%d")
outfile = File.expand_path "~/backup/instapaper/instapaper-#{date}.csv"

puts "Saving to #{outfile.inspect}..."
# NOTE(review): the message below assumes save! returns a byte count — confirm
# against the installed Mechanize version.
bytes = csv.save! outfile
puts "Done! (#{bytes} bytes written)"

# Single-string form, so the shell expands "~" before running the merge script.
system("~/backup/instapaper/merge")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Merges the dated Instapaper CSV snapshots in ~/backup/instapaper/ into a
# single ~/backup/instapaper.csv.
require 'epitools'

# [path, parsed_rows] pairs for every snapshot, ordered by path. The file names
# embed a YYYY-MM-DD date, so path order is also chronological order.
all_csvs = Path["~/backup/instapaper/instapaper-20*.csv"].
  sort.
  map { |path| [path, CSV.read(path)] }
# Finds the position of the first row in +haystack+ whose first two columns
# equal the first two columns of +needle+.
#
# @param needle [Array] the row to look for; only its first two cells are compared
# @param haystack [Array<Array>] the rows to search
# @return [Integer, nil] index of the first matching row, or nil when none matches
def find_row(needle, haystack)
  haystack.index { |row| row[0..1] == needle[0..1] }
end
puts
puts "* Merging #{all_csvs.size} CSVs..."
puts

# Accumulated output rows; the first element is the header row.
merged = []

all_csvs.each do |path, csv|
  puts "* #{path}"

  # The first row of each snapshot is its header; keep only the first one seen.
  # An empty file yields a nil header, which must not be stored because later
  # overlap lookups index into every merged row.
  headers = csv.shift
  merged << headers if merged.empty? && headers

  # NOTE(review): presumably the export lists newest entries first, so this
  # puts the snapshot in oldest-first order — confirm against a real export.
  csv.reverse!

  # Probe only the snapshot's first 10 rows for one that is already in the
  # merged output. Iterating a bounded slice (rather than pulling from an
  # enumerator inside `loop`) means a snapshot with fewer than 10 rows and no
  # overlap still falls through to the append below instead of being dropped.
  merge_pos = nil
  csv.first(10).each do |row|
    merge_pos = find_row(row, merged)
    break if merge_pos
  end

  if merge_pos
    puts " |_ Overlap at row #{merge_pos}"
    merged[merge_pos..-1] = [] # trim everything from the overlap onwards
  else
    puts " |_ No overlap"
  end

  # With or without an overlap, the whole snapshot is appended.
  merged.concat(csv)
end

outfile = File.expand_path "~/backup/instapaper.csv"

puts
puts "* Done! Total rows: #{merged.size}"
puts "* Writing to #{outfile}..."

CSV.open(outfile, "w") do |csv|
  merged.each { |row| csv << row }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment