RSS Crawler - Crawling and parsing data with feedparser and storing results into Azure DocumentDB with pydocumentdb
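Before wiring up DocumentDB, it can help to confirm which fields feedparser actually returns for the target feed. A minimal sketch, using the same feed URL as the script below; note that 'date' is a legacy feedparser alias for the entry timestamp, and newer feedparser releases expose 'published' instead:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sketch: inspect the feed fields the crawler below relies on
import feedparser

feed = feedparser.parse('http://blogs.msdn.com/b/windowsazurej/atom.aspx')
for entry in feed['entries'][:3]:  # a few entries are enough for a spot check
    print(entry['title'])
    print(entry['link'])
    print(entry['description'])
    print(entry['date'])  # legacy alias; newer feedparser exposes 'published'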
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import feedparser
import pydocumentdb.document_client as document_client

Docdb_masterKey = '<Your DocumentDB master key string>'
Docdb_host = 'https://<documentdb account>.documents.azure.com:443/'
Docdb_dbname = '<documentdb database name>'
Docdb_colname = '<documentdb collection name>'

feedurl = 'http://blogs.msdn.com/b/windowsazurej/atom.aspx'


def rsscrawling():
    # Create a DocumentDB client instance
    client = document_client.DocumentClient(Docdb_host,
                                            {'masterKey': Docdb_masterKey})

    # Create the database if it does not exist yet
    database_definition = {'id': Docdb_dbname}
    databases = list(client.QueryDatabases({
        'query': 'SELECT * FROM root r WHERE r.id=@id',
        'parameters': [
            {'name': '@id', 'value': database_definition['id']}
        ]
    }))
    if len(databases) > 0:
        feeddb = databases[0]
    else:
        print("created database: %s" % Docdb_dbname)
        feeddb = client.CreateDatabase(database_definition)

    # Create the collection if it does not exist yet
    collection_definition = {'id': Docdb_colname}
    collections = list(client.QueryCollections(
        feeddb['_self'],
        {
            'query': 'SELECT * FROM root r WHERE r.id=@id',
            'parameters': [
                {'name': '@id', 'value': collection_definition['id']}
            ]
        }))
    if len(collections) > 0:
        collection = collections[0]
    else:
        print("created collection: %s" % Docdb_colname)
        collection = client.CreateCollection(
            feeddb['_self'], collection_definition)

    # Fetch and parse the RSS/Atom feed with feedparser
    feed = feedparser.parse(feedurl)
    for entry in feed['entries']:
        document_definition = {'title': entry['title'],
                               'content': entry['description'],
                               'permalink': entry['link'],
                               'postdate': entry['date']}
        # Deduplicate by permalink: skip entries that are already stored
        docs = list(client.QueryDocuments(
            collection['_self'],
            {
                'query': 'SELECT * FROM root r WHERE r.permalink=@permalink',
                'parameters': [
                    {'name': '@permalink',
                     'value': document_definition['permalink']}
                ]
            }))
        if len(docs) < 1:
            # Only create the document if it is genuinely new
            print("added document: %s" % entry['title'])
            client.CreateDocument(collection['_self'], document_definition)


if __name__ == '__main__':
    rsscrawling()
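A quick way to verify the crawl is to read the stored documents back. A minimal sketch, assuming the same Docdb_* settings as above and a pydocumentdb release that accepts name-based links of the form 'dbs/<db>/colls/<collection>'; on older releases, look the collection up by id and use its '_self' link, as the script above does:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sketch: read back the stored feed entries (assumes the Docdb_* values above)
import pydocumentdb.document_client as document_client

client = document_client.DocumentClient(Docdb_host,
                                        {'masterKey': Docdb_masterKey})
# Name-based resource link to the collection (placeholder names as above)
collection_link = 'dbs/' + Docdb_dbname + '/colls/' + Docdb_colname
for doc in client.ReadDocuments(collection_link):
    print("%s  %s" % (doc['postdate'], doc['title']))

One design note: the duplicate check in the crawler issues one query per feed entry, which is fine for a small blog feed; for larger feeds, fetching the stored permalinks once up front would cut the number of round trips.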