EXAMPLE JSON PAYLOADS FOR BRIGHTPLANET HARVEST API
=================================================
1. Website harvest - scraping search results pages
2. Website harvest - harvesting a list of URLs, with an XPath overwrite and date-finding XPaths
3. Website harvest - scheduled harvest to monitor for new documents
4. Deep Web harvest - query search engines (USE SPARINGLY - rate limits)
5. Deep Web harvest - query sources from multiple source groups
6. RSS harvest - monitor for new documents daily using RSS feeds, with date-finding XPaths
7. XPath expressions - use these XPaths to control which text is harvested from a web page
=================================================
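Each numbered payload below is the JSON body of a request that creates a harvest event. A minimal Python sketch of submitting one is shown here; the base URL, endpoint path, and API-key header are placeholders rather than documented BrightPlanet values, so substitute the details from your own Harvest API documentation and credentials (requests is a third-party package).
# Minimal sketch of POSTing one of the payloads below.
# ASSUMPTIONS: the base URL, the /harvests path, and the X-Api-Key header are
# placeholders for illustration only -- use the real values from your
# BrightPlanet Harvest API documentation and credentials.
import json
import requests

API_BASE = "https://harvest.example.com/api"   # placeholder base URL
API_KEY = "YOUR_API_KEY"                       # placeholder credential

payload = {
    "id": "string",
    "harvestEventType": "SITE",
    "scheduleType": "ONCE",
    "name": "website_searchResults_depth1",
    # ...remaining fields exactly as in example 1 below...
}

resp = requests.post(
    API_BASE + "/harvests",                    # placeholder path
    headers={"Content-Type": "application/json", "X-Api-Key": API_KEY},
    data=json.dumps(payload),
)
resp.raise_for_status()
print(resp.status_code, resp.text)
-------------------------------------------------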
1. Website harvest at depth=1, skipLevels=1 with a URL path filter and filterQuery (scraping search results pages)
{
  "id": "string",
  "harvestEventType": "SITE",
  "scheduleType": "ONCE",
  "name": "website_searchResults_depth1",
  "siteHarvestParameters": {
    "startingUrls": [
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=11",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=21",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=31",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=41",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=51",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=61",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=71",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=81",
      "http://search.eurekalert.org/e3/query.html?charset=iso-8859-1&pw=100.101%25&qt=cancer&st=91"
    ],
    "siteMaxDepth": 1,
    "siteMaxDepthExternal": 0,
    "skipLevels": 1
  },
  "filterQuery": "cancer OR lymphoma OR (breast AND diagnose)",
  "tags": [
    "source_News", "topic_Science"
  ],
  "inclusionDomains": [
    "path:/pub_releases/"
  ],
  "maxDocCount": 1000,
  "maxDocSize": -1
}
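The ten startingUrls above are the same search results page with the st= offset advancing by 10 per page. A short Python sketch that builds the identical list, which is handy when more pages are needed:
# Build the startingUrls list from example 1: the pages differ only in the
# st= offset, which advances by 10 per results page.
base = ("http://search.eurekalert.org/e3/query.html"
        "?charset=iso-8859-1&pw=100.101%25&qt=cancer")
starting_urls = [base] + [f"{base}&st={offset}" for offset in range(11, 100, 10)]
for url in starting_urls:
    print(url)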
-------------------------------------------------
2. Website harvest at depth=0 with an XPath overwrite, date-finding XPaths, and a filterQuery
{
  "id": "string",
  "harvestEventType": "SITE",
  "scheduleType": "ONCE",
  "name": "website_XpathOverwrite",
  "siteHarvestParameters": {
    "startingUrls": [
      "http://www.breitbart.com/2016-presidential-race/2016/07/10/chiat-hitlers-rise-power-explains-trump/"
    ],
    "siteMaxDepth": 0,
    "siteMaxDepthExternal": 0,
    "skipLevels": 0
  },
  "filterQuery": "trump",
  "xpaths": [
    "textArticle:overwrite=true",
    "title://title",
    "textArticle://div[@class='entry-content']",
    "dateNode:((//*[contains(local-name(), 'time')]|//*[contains(local-name(), 'Time')]|//*[contains(local-name(), 'date')]|//*[contains(local-name(), 'Date')])/text()[normalize-space()])[1]",
    "dateAttrib:((//*[(contains(@itemprop|@class|@id, 'Date') or contains(@itemprop|@class|@id, 'Time') or contains(@itemprop|@class|@id, 'Posted') or contains(@itemprop|@class|@id, 'Publish') or contains(@itemprop|@class|@id, 'date') or contains(@itemprop|@class|@id, 'time') or contains(@itemprop|@class|@id, 'posted') or contains(@itemprop|@class|@id, 'publish')) and not(contains(@itemprop|@class|@id, 'Timeline')) and not(contains(@itemprop|@class|@id, 'timeline'))])/text()[normalize-space()])[1]",
    "dateMeta:(//meta[contains(@class|@itemprop|@id, 'date') or contains(@class|@itemprop|@id, 'time') or contains(@class|@itemprop|@id, 'Date') or contains(@class|@itemprop|@id, 'Time')])[1]/@content",
    "dateText:(//*[text()[contains(.,'201') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]]|//*[text()[contains(.,'200') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]])[position() < 4]"
  ],
  "tags": [
    "source_News", "topic_Finance"
  ],
  "maxDocCount": 25,
  "maxDocSize": -1
}
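Before submitting an XPath overwrite, it helps to confirm that the expression actually matches the article body on the target page. Below is a local check using the example URL and the textArticle XPath above; requests and lxml are third-party packages and are assumptions of this sketch, not part of the Harvest API:
# Preview locally what the textArticle overwrite in example 2 would capture.
import requests
from lxml import html

url = ("http://www.breitbart.com/2016-presidential-race/2016/07/10/"
       "chiat-hitlers-rise-power-explains-trump/")
tree = html.fromstring(requests.get(url, timeout=30).content)

nodes = tree.xpath("//div[@class='entry-content']")
if nodes:
    text = " ".join(nodes[0].itertext())
    print(text[:300])   # first 300 characters of the text that would be harvested
else:
    print("XPath matched nothing -- adjust the expression before harvesting")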
-------------------------------------------------
3. Website harvest scheduled to run every two days and pick up new documents
{
  "id": "string",
  "harvestEventType": "SITE",
  "scheduleType": "RECURRING",
  "name": "website_scheduled",
  "siteHarvestParameters": {
    "startingUrls": [
      "http://www.bloomberg.com/technology"
    ],
    "siteMaxDepth": 1,
    "siteMaxDepthExternal": 0,
    "skipLevels": 1
  },
  "tags": [
    "source_News", "topic_Finance"
  ],
  "inclusionDomains": [
    "path:/news/articles/"
  ],
  "delay": 1,
  "interval": 172800000,
  "maxDocCount": 500,
  "maxDocSize": -1
}
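The interval is in milliseconds: 172800000 ms is exactly two days, and the daily RSS example below uses 86400000 ms. A small helper for converting a schedule into that millisecond value (standard library only):
# interval is in milliseconds: 172800000 ms = 2 days (this example),
# 86400000 ms = 1 day (the RSS example below).
from datetime import timedelta

def to_millis(**kwargs):
    """Convert e.g. days=2 or hours=12 to the millisecond value used by interval."""
    return int(timedelta(**kwargs).total_seconds() * 1000)

print(to_millis(days=2))   # 172800000
print(to_millis(days=1))   # 86400000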
-------------------------------------------------
4. Deep Web harvest for search engine sources (USE SPARINGLY: these sources block aggressive querying easily, and a blocked source returns 0 results)
{
  "id": "string",
  "harvestEventType": "DEEP",
  "scheduleType": "ONCE",
  "name": "deep_SearchEngine",
  "deepHarvestParameters": {
    "deepQueries": [
      "\"theresa may\" AND brexit",
      "p2p OR \"peer to peer\" OR \"sharing economy\""
    ],
    "categoryIds": [
      2670
    ]
  },
  "filterQuery": "brexit OR \"theresa may\" OR p2p OR \"peer to peer\" OR \"sharing economy\" OR (economy AND sharing)",
  "tags": [
    "source_General", "topic_Finance"
  ],
  "maxDocCount": 1000,
  "maxDocSize": -1
}
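The \" sequences in deepQueries and filterQuery are ordinary JSON string escapes around exact-phrase quotes. If you assemble the payload programmatically, let the serializer produce that escaping instead of writing it by hand; a Python sketch using the queries from this example:
# The backslash-escaped quotes in deepQueries/filterQuery are plain JSON
# escapes for exact-phrase quotes; json.dumps generates them automatically.
import json

fragment = {
    "deepHarvestParameters": {
        "deepQueries": ['"theresa may" AND brexit',
                        'p2p OR "peer to peer" OR "sharing economy"'],
        "categoryIds": [2670],
    }
}
print(json.dumps(fragment, indent=2))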
-------------------------------------------------
5. Deep Web harvest for multiple categories
{
  "id": "string",
  "harvestEventType": "DEEP",
  "scheduleType": "ONCE",
  "name": "deep_SearchEngine",
  "deepHarvestParameters": {
    "deepQueries": [
      "\"theresa may\" AND brexit",
      "peer AND sharing AND economy"
    ],
    "categoryIds": [
      2844, 2845, 2846, 2851, 2895
    ]
  },
  "filterQuery": "brexit OR \"theresa may\" OR peer OR sharing OR economy",
  "tags": [
    "source_News", "topic_Finance"
  ],
  "maxDocCount": 500,
  "maxDocSize": -1
}
-------------------------------------------------
6. RSS harvest to monitor a website daily for new documents
{
  "id": "string",
  "harvestEventType": "RSS",
  "scheduleType": "RECURRING",
  "name": "rss_monitor",
  "delay": 0,
  "interval": 86400000,
  "maxDocCount": 100,
  "maxDocSize": -1,
  "rssHarvestParameters": {
    "levelsInternal": 1,
    "levelsExternal": 0,
    "initialUrls": [
      "http://rss.cnn.com/rss/cnn_topstories.rss",
      "http://www.npr.org/rss/rss.php?id=1019"
    ]
  },
  "xpaths": [
    "dateNode:((//*[contains(local-name(), 'time')]|//*[contains(local-name(), 'Time')]|//*[contains(local-name(), 'date')]|//*[contains(local-name(), 'Date')])/text()[normalize-space()])[1]",
    "dateAttrib:((//*[(contains(@itemprop|@class|@id, 'Date') or contains(@itemprop|@class|@id, 'Time') or contains(@itemprop|@class|@id, 'Posted') or contains(@itemprop|@class|@id, 'Publish') or contains(@itemprop|@class|@id, 'date') or contains(@itemprop|@class|@id, 'time') or contains(@itemprop|@class|@id, 'posted') or contains(@itemprop|@class|@id, 'publish')) and not(contains(@itemprop|@class|@id, 'Timeline')) and not(contains(@itemprop|@class|@id, 'timeline'))])/text()[normalize-space()])[1]",
    "dateMeta:(//meta[contains(@class|@itemprop|@id, 'date') or contains(@class|@itemprop|@id, 'time') or contains(@class|@itemprop|@id, 'Date') or contains(@class|@itemprop|@id, 'Time')])[1]/@content",
    "dateText:(//*[text()[contains(.,'201') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]]|//*[text()[contains(.,'200') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]])[position() < 4]"
  ],
  "tags": [
    "source_News", "topic_Manufacturing"
  ]
}
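Before scheduling a recurring RSS harvest, it is worth confirming that each feed URL is live and looking at the per-item links a levelsInternal=1 harvest would follow. A standard-library sketch that assumes a plain RSS 2.0 <channel>/<item> layout:
# Sanity-check a feed from example 6 and list the item links the harvester
# would follow one level deep. Standard library only; assumes RSS 2.0.
import urllib.request
import xml.etree.ElementTree as ET

feed_url = "http://rss.cnn.com/rss/cnn_topstories.rss"
with urllib.request.urlopen(feed_url, timeout=30) as resp:
    root = ET.fromstring(resp.read())

for item in root.iter("item"):
    title = item.findtext("title", default="")
    link = item.findtext("link", default="")
    print(title, "->", link)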
-------------------------------------------------
7. XPath expressions - use these XPaths to control which text is harvested from a web page
OVERWRITE DEFAULT BOILERPIPE TEXT AND CAPTURE ALL TEXT:
"textArticle:overwrite=true",
"textArticle://body//*[not(self::script or self::style)]/text()[normalize-space(.)]"
OVERWRITE DEFAULT BOILERPIPE TEXT AND CAPTURE CUSTOM XPATH:
"textArticle:overwrite=true",
"textArticle:[FILL W/ XPATH TO DICTATE WHICH TEXT IS HARVESTED, USE PIPE CHARACTER TO 'OR' XPATH EXPRESSIONS]"
SEARCH NODES FOR PUBLISH DATE:
"dateNode:((//*[contains(local-name(), 'time')]|//*[contains(local-name(), 'Time')]|//*[contains(local-name(), 'date')]|//*[contains(local-name(), 'Date')])/text()[normalize-space()])[1]",
"dateAttrib:((//*[(contains(@itemprop|@class|@id, 'Date') or contains(@itemprop|@class|@id, 'Time') or contains(@itemprop|@class|@id, 'Posted') or contains(@itemprop|@class|@id, 'Publish') or contains(@itemprop|@class|@id, 'date') or contains(@itemprop|@class|@id, 'time') or contains(@itemprop|@class|@id, 'posted') or contains(@itemprop|@class|@id, 'publish')) and not(contains(@itemprop|@class|@id, 'Timeline')) and not(contains(@itemprop|@class|@id, 'timeline'))])/text()[normalize-space()])[1] ",
"dateMeta:(//meta[contains(@class|@itemprop|@id, 'date') or contains(@class|@itemprop|@id, 'time') or contains(@class|@itemprop|@id, 'Date') or contains(@class|@itemprop|@id, 'Time')])[1]/@content ",
"dateText:(//*[text()[contains(.,'201') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]]|//*[text()[contains(.,'200') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]])[position() < 4]"