Last active
March 14, 2017 19:34
-
-
Save dahnielson/097628605744715bc346433bb3ac32b6 to your computer and use it in GitHub Desktop.
Sitemap for webscraper.io (scrapes Delicious bookmarks) and a Python script that imports the exported CSV into Pinboard
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "startUrl": "https://del.icio.us/<your username>?&page=[1-100]",
  "selectors": [{
    "parentSelectors": ["_root"],
    "type": "SelectorElement",
    "multiple": true,
    "id": "bookmark",
    "selector": "div.articleThumbBlockOuter",
    "delay": ""
  }, {
    "parentSelectors": ["bookmark"],
    "type": "SelectorText",
    "multiple": false,
    "id": "title",
    "selector": "a.title",
    "regex": "",
    "delay": ""
  }, {
    "parentSelectors": ["bookmark"],
    "type": "SelectorElementAttribute",
    "multiple": false,
    "id": "link",
    "selector": "p:nth-of-type(1) a",
    "delay": "",
    "extractAttribute": "href"
  }, {
    "parentSelectors": ["bookmark"],
    "type": "SelectorGroup",
    "id": "tag",
    "selector": "ul.tagName a",
    "extractAttribute": "",
    "delay": ""
  }, {
    "parentSelectors": ["bookmark"],
    "type": "SelectorElementAttribute",
    "multiple": false,
    "id": "date",
    "selector": "_parent_",
    "extractAttribute": "date",
    "delay": ""
  }, {
    "parentSelectors": ["bookmark"],
    "type": "SelectorElementAttribute",
    "multiple": false,
    "id": "private",
    "selector": "div.articleThumbBlock",
    "extractAttribute": "class",
    "delay": ""
  }],
  "_id": "delicious"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import time
import urllib.parse
import urllib.request
from datetime import datetime, timezone
# Path to the CSV file exported from the web scraper; replace the placeholder
# before running the script.
bookmark_file = '<exported csv file name>'

# Pinboard API token, sent as the `auth_token` query parameter on every
# request. Replace the placeholder with your own token and keep it private.
pinboard_token = '<your pinboard token>'
def process_tags(csv_tags):
    """Extract the tag names from one CSV cell holding a JSON tag list.

    Args:
        csv_tags: A JSON array of objects that each carry a ``"tag"`` key,
            e.g. ``'[{"tag": "python"}, {"tag": "web"}]``. An empty,
            whitespace-only or ``None`` cell means the bookmark has no tags.

    Returns:
        A list with the value of every ``"tag"`` key, in input order; an
        empty list when the cell is empty or holds JSON ``null``.

    Raises:
        json.JSONDecodeError: If a non-empty cell is not valid JSON.
        KeyError: If an entry in the array has no ``"tag"`` key.
    """
    # An empty cell is not valid JSON, so short-circuit instead of letting
    # json.loads() raise on bookmarks that simply have no tags.
    if csv_tags is None or not csv_tags.strip():
        return []
    # `or []` covers a cell that contains the JSON literal `null`.
    json_tags = json.loads(csv_tags) or []
    return [json_tag['tag'] for json_tag in json_tags]
# Pinboard documents a rate limit of one API call every three seconds per
# user; pausing between requests avoids 429 Too Many Requests errors.
pinboard_request_interval = 3

# Replay every bookmark in the exported CSV against Pinboard's posts/add
# endpoint. Columns used below: 0 = title, 1 = URL, 2 = tags (JSON),
# 3 = creation time (Unix seconds), 4 = CSS class list marking private posts.
with open(bookmark_file, newline='', encoding="utf8") as csvfile:
    bookmarks = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(bookmarks, None)  # Skip header; tolerate a completely empty file
    for bookmark in bookmarks:
        if not bookmark:
            continue  # csv.reader yields [] for blank lines

        # Pinboard expects UTC datetimes formatted like 2010-12-11T19:48:02Z,
        # so convert explicitly instead of emitting naive local time.
        created = datetime.fromtimestamp(int(bookmark[3]), tz=timezone.utc)
        bookmark_post_data = {
            'url': bookmark[1],
            'description': bookmark[0],
            'dt': created.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'shared': 'no' if 'privateArticle' in bookmark[4].split() else 'yes'
        }

        # Only parse the tag cell when it has content; an empty cell is not
        # valid JSON.
        if bookmark[2]:
            tags = process_tags(bookmark[2])
            if tags:
                bookmark_post_data['tags'] = ",".join(tags)

        bookmark_post_data['auth_token'] = pinboard_token
        print("Adding %s" % bookmark[1])
        params = urllib.parse.urlencode(bookmark_post_data)
        # The context manager closes the HTTP response (and its socket) even
        # if reading fails.
        with urllib.request.urlopen("https://api.pinboard.in/v1/posts/add?%s" % params) as response:
            response.read()
        time.sleep(pinboard_request_interval)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment