Last active
September 29, 2016 18:55
-
-
Save coreyhermanson/cfb03fb3b41bcefd0f9ca1e29c7c1035 to your computer and use it in GitHub Desktop.
Example JSON requests for Website, Deep Web, and RSS harvests using the BrightPlanet Harvest API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
EXAMPLE JSON PAYLOADS FOR BRIGHTPLANET HARVEST API | |
================================================= | |
1. Website harvest - scraping search results pages | |
2. Website harvest - harvesting a list of URLs, includes Xpath overwrite and Date-finding Xpath | |
3. Website harvest - scheduled harvest to monitor new documents | |
4. Deep Web harvest - query search engines (USE SPARINGLY - rate limits) | |
5. Deep Web harvest - query sources from multiple source groups | |
6. RSS harvest - monitor new documents daily using RSS feeds, includes Date-finding Xpath | |
7. XPATH expressions - use these xpaths to manipulate which text is harvested from a web page | |
================================================= | |
1. Website Harvest at depth=1, skipLevels=1 w/ URL path filter and filterQuery (scraping search results pages) | |
{ | |
"id": "string", | |
"harvestEventType": "SITE", | |
"scheduleType": "ONCE", | |
"name": "website_searchResults_depth1", | |
"siteHarvestParameters": { | |
"startingUrls": [ | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=11", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=21", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=31", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=41", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=51", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=61", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=71", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=81", | |
"http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=91" | |
], | |
"siteMaxDepth": 1, | |
"siteMaxDepthExternal": 0, | |
"skipLevels": 1 | |
}, | |
"filterQuery": "cancer OR lymphoma OR (breast AND diagnose)", | |
"tags": [ | |
"source_News", "topic_Science" | |
], | |
"inclusionDomains": [ | |
"path:/pub_releases/" | |
], | |
"maxDocCount": 1000, | |
"maxDocSize": -1 | |
} | |
------------------------------------------------- | |
2. Website Harvest at depth=0 w/ xpath overwrite and filterQuery | |
{ | |
"id": "string", | |
"harvestEventType": "SITE", | |
"scheduleType": "ONCE", | |
"name": "website_XpathOverwrite", | |
"siteHarvestParameters": { | |
"startingUrls": [ | |
"http://www.breitbart.com/2016-presidential-race/2016/07/10/chiat-hitlers-rise-power-explains-trump/" | |
], | |
"siteMaxDepth": 0, | |
"siteMaxDepthExternal": 0, | |
"skipLevels": 0 | |
}, | |
"filterQuery": "trump", | |
"xpaths": [ | |
"textArticle:overwrite=true", | |
"title://title", | |
"textArticle://div[@class='entry-content']", | |
"dateNode:((//*[contains(local-name(), 'time')]|//*[contains(local-name(), 'Time')]|//*[contains(local-name(), 'date')]|//*[contains(local-name(), 'Date')])/text()[normalize-space()])[1]", | |
"dateAttrib:((//*[(contains(@itemprop|@class|@id, 'Date') or contains(@itemprop|@class|@id, 'Time') or contains(@itemprop|@class|@id, 'Posted') or contains(@itemprop|@class|@id, 'Publish') or contains(@itemprop|@class|@id, 'date') or contains(@itemprop|@class|@id, 'time') or contains(@itemprop|@class|@id, 'posted') or contains(@itemprop|@class|@id, 'publish')) and not(contains(@itemprop|@class|@id, 'Timeline')) and not(contains(@itemprop|@class|@id, 'timeline'))])/text()[normalize-space()])[1] ", | |
"dateMeta:(//meta[contains(@class|@itemprop|@id, 'date') or contains(@class|@itemprop|@id, 'time') or contains(@class|@itemprop|@id, 'Date') or contains(@class|@itemprop|@id, 'Time')])[1]/@content ", | |
"dateText:(//*[text()[contains(.,'201') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]]|//*[text()[contains(.,'200') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]])[position() < 4]" | |
], | |
"tags": [ | |
"source_News", "topic_Finance" | |
], | |
"maxDocCount": 25, | |
"maxDocSize": -1 | |
} | |
------------------------------------------------- | |
3. Website harvest scheduled to run every two days (interval of 172800000 ms = 48 hours) to harvest new documents | |
{ | |
"id": "string", | |
"harvestEventType": "SITE", | |
"scheduleType": "RECURRING", | |
"name": "website_scheduled", | |
"siteHarvestParameters": { | |
"startingUrls": [ | |
"http://www.bloomberg.com/technology" | |
], | |
"siteMaxDepth": 1, | |
"siteMaxDepthExternal": 0, | |
"skipLevels": 1 | |
}, | |
"tags": [ | |
"source_News", "topic_Finance" | |
], | |
"inclusionDomains": [ | |
"path:/news/articles/" | |
], | |
"delay": 1, | |
"interval": 172800000, | |
"maxDocCount": 500, | |
"maxDocSize": -1 | |
} | |
------------------------------------------------- | |
4. Deep Web harvest for search engine sources (USE SPARINGLY: it's easy to get blocked when using these sources, and a blocked harvest will return 0 results) | |
{ | |
"id": "string", | |
"harvestEventType": "DEEP", | |
"scheduleType": "ONCE", | |
"name": "deep_SearchEngine", | |
"deepHarvestParameters": { | |
"deepQueries": [ | |
"\"theresa may\" AND brexit", | |
"p2p OR \"peer to peer\" OR \"sharing economy\"" | |
], | |
"categoryIds": [ | |
2670 | |
] | |
}, | |
"filterQuery": "brexit OR \"theresa may\" OR p2p OR \"peer to peer\" OR \"sharing economy\" OR (economy AND sharing)", | |
"tags": [ | |
"source_General", "topic_Finance" | |
], | |
"maxDocCount": 1000, | |
"maxDocSize": -1 | |
} | |
------------------------------------------------- | |
5. Deep Web harvest for multiple categories | |
{ | |
"id": "string", | |
"harvestEventType": "DEEP", | |
"scheduleType": "ONCE", | |
"name": "deep_multipleCategories", | |
"deepHarvestParameters": { | |
"deepQueries": [ | |
"\"theresa may\" AND brexit", | |
"peer AND sharing AND economy" | |
], | |
"categoryIds": [ | |
2844, 2845, 2846, 2851, 2895 | |
] | |
}, | |
"filterQuery": "brexit OR \"theresa may\" OR peer OR sharing OR economy", | |
"tags": [ | |
"source_News", "topic_Finance" | |
], | |
"maxDocCount": 500, | |
"maxDocSize": -1 | |
} | |
------------------------------------------------- | |
6. RSS harvest to monitor a website daily for new documents (interval of 86400000 ms = 24 hours) | |
{ | |
"id": "string", | |
"harvestEventType": "RSS", | |
"scheduleType": "RECURRING", | |
"name": "rss_monitor", | |
"delay": 0, | |
"interval": 86400000, | |
"maxDocCount": 100, | |
"maxDocSize": -1, | |
"rssHarvestParameters": { | |
"levelsInternal": 1, | |
"levelsExternal": 0, | |
"initialUrls": [ | |
"http://rss.cnn.com/rss/cnn_topstories.rss", | |
"http://www.npr.org/rss/rss.php?id=1019" | |
] | |
}, | |
"xpaths": [ | |
"dateNode:((//*[contains(local-name(), 'time')]|//*[contains(local-name(), 'Time')]|//*[contains(local-name(), 'date')]|//*[contains(local-name(), 'Date')])/text()[normalize-space()])[1]", | |
"dateAttrib:((//*[(contains(@itemprop|@class|@id, 'Date') or contains(@itemprop|@class|@id, 'Time') or contains(@itemprop|@class|@id, 'Posted') or contains(@itemprop|@class|@id, 'Publish') or contains(@itemprop|@class|@id, 'date') or contains(@itemprop|@class|@id, 'time') or contains(@itemprop|@class|@id, 'posted') or contains(@itemprop|@class|@id, 'publish')) and not(contains(@itemprop|@class|@id, 'Timeline')) and not(contains(@itemprop|@class|@id, 'timeline'))])/text()[normalize-space()])[1] ", | |
"dateMeta:(//meta[contains(@class|@itemprop|@id, 'date') or contains(@class|@itemprop|@id, 'time') or contains(@class|@itemprop|@id, 'Date') or contains(@class|@itemprop|@id, 'Time')])[1]/@content ", | |
"dateText:(//*[text()[contains(.,'201') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]]|//*[text()[contains(.,'200') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]])[position() < 4]" | |
], | |
"tags": [ | |
"source_News", "topic_Manufacturing" | |
] | |
} | |
------------------------------------------------- | |
7. XPATH expressions - use these xpaths to manipulate which text is harvested from a web page | |
OVERWRITE DEFAULT BOILERPIPE TEXT AND CAPTURE ALL TEXT: | |
"textArticle:overwrite=true", | |
"textArticle://body//*[not(self::script or self::style)]/text()[normalize-space(.)]" | |
OVERWRITE DEFAULT BOILERPIPE TEXT AND CAPTURE CUSTOM XPATH: | |
"textArticle:overwrite=true", | |
"textArticle:[REPLACE WITH AN XPATH THAT SELECTS THE TEXT TO HARVEST; USE THE PIPE CHARACTER (|) TO 'OR' MULTIPLE XPATH EXPRESSIONS]" | |
SEARCH NODES FOR PUBLISH DATE: | |
"dateNode:((//*[contains(local-name(), 'time')]|//*[contains(local-name(), 'Time')]|//*[contains(local-name(), 'date')]|//*[contains(local-name(), 'Date')])/text()[normalize-space()])[1]", | |
"dateAttrib:((//*[(contains(@itemprop|@class|@id, 'Date') or contains(@itemprop|@class|@id, 'Time') or contains(@itemprop|@class|@id, 'Posted') or contains(@itemprop|@class|@id, 'Publish') or contains(@itemprop|@class|@id, 'date') or contains(@itemprop|@class|@id, 'time') or contains(@itemprop|@class|@id, 'posted') or contains(@itemprop|@class|@id, 'publish')) and not(contains(@itemprop|@class|@id, 'Timeline')) and not(contains(@itemprop|@class|@id, 'timeline'))])/text()[normalize-space()])[1] ", | |
"dateMeta:(//meta[contains(@class|@itemprop|@id, 'date') or contains(@class|@itemprop|@id, 'time') or contains(@class|@itemprop|@id, 'Date') or contains(@class|@itemprop|@id, 'Time')])[1]/@content ", | |
"dateText:(//*[text()[contains(.,'201') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]]|//*[text()[contains(.,'200') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]])[position() < 4]" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment