- Unscheduled: the Deep Web harvest executes immediately (no delay parameter) and runs once (scheduleType="ONCE" and no interval parameter)
{
  "id": "string",
  "harvestEventType": "DEEP",
#!/usr/bin/env python
"""
This script returns documents from the BrightPlanet REST API. Input is a text file with a list of queries.
Output is a CSV file with your desired fields for each document. The default time period is everything up to the present.
Requires the 'requests' module. To install from the command line, enter: python -m pip install requests
"""
import requests
import csv

input_file = r'YOUR_FULL_FILEPATH_HERE'      # full path to the text file of queries

# Harvest Event Variables
var_scheduled = "RECURRING"                  # string - "ONCE" or "RECURRING"
var_initial_delay = 1.0                      # float
var_time_between_scheduled_events = 12.0     # float
var_max_depth = 1                            # int
var_depth_external = 0                       # int
var_max_docsize = -1                         # int
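The header above declares only the inputs. A sketch of the remaining flow described in the docstring (read queries, fetch documents, write a CSV) follows; the endpoint, the query parameter name, the response shape, and the CSV fields are illustrative assumptions rather than the documented API, and output_file is a placeholder introduced here.

# Sketch only: DOCS_URL, the "q" parameter, and the response keys are assumptions.
DOCS_URL = "https://example.brightplanet.com/documents"   # hypothetical endpoint
output_file = r'YOUR_OUTPUT_CSV_HERE'                     # placeholder output path

with open(input_file) as f:
    queries = [line.strip() for line in f if line.strip()]

with open(output_file, 'w', newline='', encoding='utf-8') as out:
    writer = csv.writer(out)
    writer.writerow(["query", "title", "url", "date"])         # your desired fields
    for query in queries:
        resp = requests.get(DOCS_URL, params={"q": query})     # assumed parameter name
        resp.raise_for_status()
        for doc in resp.json().get("documents", []):           # assumed response shape
            writer.writerow([query, doc.get("title"), doc.get("url"), doc.get("date")])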
#!/usr/bin/env python
import pyperclip
import re
from list_clipboard_manipulations import list_to_clipboard

delete_counter = 0    # number of lines dropped from the clipboard contents
good_list = list()    # lines that survive filtering
sort_alpha = False    # set True to sort the kept lines alphabetically before copying
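A sketch of how these variables might be used follows: read the clipboard, drop lines that match a pattern, optionally sort what is left, and copy it back with list_to_clipboard. The pattern is only a placeholder.

# Placeholder pattern; swap in whatever you actually want to strip.
drop_pattern = re.compile(r'^\s*$')          # example: drop blank lines

for line in pyperclip.paste().splitlines():
    if drop_pattern.search(line):
        delete_counter += 1                  # count the lines thrown away
    else:
        good_list.append(line)               # keep everything else

if sort_alpha:
    good_list.sort(key=str.lower)            # optional alphabetical sort

list_to_clipboard(good_list)                 # copy the surviving lines back
print(f"Removed {delete_counter} line(s); kept {len(good_list)}.")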
import requests

infile = r'C:\Users\Account\PythonFiles\generic_infile.txt'  # full path to any file inside quotes

# Harvest Event Variables
api_key = "123abc"                              # STRING - 1 API key per Harvest API schema
searchable_items_per_event = 100                # INT - max queries OR max screenNames
name_of_event = "NewYork_Politics"              # STRING - program will prepend "TW_" and append "_#"
filterQuery = None                              # STRING - ex: "nuclear AND (war OR energy)"
event_tags = ["source_Politics", "New York"]    # LIST
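Below is a sketch of how these variables could drive event creation: the items in infile are split into batches no larger than searchable_items_per_event, and each batch is named TW_<name_of_event>_<n>, matching the naming rule noted above. The endpoint and payload keys are assumptions, not the documented schema.

# Sketch: EVENTS_URL and the payload keys are assumptions.
EVENTS_URL = "https://example.brightplanet.com/harvest/event"   # hypothetical endpoint

with open(infile) as f:
    items = [line.strip() for line in f if line.strip()]

# Split the queries/screen names into batches of at most searchable_items_per_event.
batches = [items[i:i + searchable_items_per_event]
           for i in range(0, len(items), searchable_items_per_event)]

for n, batch in enumerate(batches, start=1):
    payload = {
        "name": f"TW_{name_of_event}_{n}",   # "TW_" prefix and "_#" suffix, as noted above
        "items": batch,
        "filterQuery": filterQuery,
        "tags": event_tags,
    }
    resp = requests.post(EVENTS_URL, json=payload, headers={"Authorization": api_key})
    resp.raise_for_status()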
EXAMPLE JSON PAYLOADS FOR BRIGHTPLANET HARVEST API
=================================================
1. Website harvest - scraping search results pages
2. Website harvest - harvesting a list of URLs, includes XPath overwrite and date-finding XPath
3. Website harvest - scheduled harvest to monitor new documents
4. Deep Web harvest - query search engines (USE SPARINGLY - rate limits)
5. Deep Web harvest - query sources from multiple source groups
6. RSS harvest - monitor new documents daily using RSS feeds, includes XPath overwrite and date-finding XPath
7. XPath expressions - use these XPaths to control which text is harvested from a web page
=================================================
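The payloads listed as items 1-7 are not reproduced here. As a generic illustration of item 7, two XPath expressions of the kind used for an XPath overwrite and for date finding are sketched below; the element and class names are hypothetical.

# Hypothetical page structure; adjust the element/class names to the target site.
xpath_overwrite = "//div[@class='article-body']//text()"   # keep only the article body text
xpath_date = "//time/@datetime"                             # pull the date from a <time> element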
#!/usr/bin/env python
import pyperclip

example_list = ["Line 1", "Line 2", "Line 3", "forever and ever"]

def list_to_clipboard(output_list):
    """ Check if len(list) > 0, then copy to clipboard """
    if len(output_list) > 0:
        pyperclip.copy('\n'.join(output_list))
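A quick usage example with the example_list defined above:

list_to_clipboard(example_list)   # copies the four lines to the clipboard, newline-separated
print(pyperclip.paste())          # paste back out to confirm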
#!/usr/bin/env python
import re

input_file = 'infile.txt'    # enter full file path; precede the string with 'r' (r'PATH') if using Windows
output_file = 'outfile.txt'  # enter full file path; precede the string with 'r' (r'PATH') if using Windows
delete_counter = 0

# list of individual regexes, which will be combined into a single regex in the next step
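A sketch of that next step follows; the two patterns are placeholders for whatever lines you actually want to delete.

regex_list = [
    r'^\s*$',    # placeholder: blank lines
    r'^#',       # placeholder: comment lines
]
combined = re.compile('|'.join(regex_list))   # one alternation covering every pattern

with open(input_file) as fin, open(output_file, 'w') as fout:
    for line in fin:
        if combined.search(line):
            delete_counter += 1               # drop lines matching any pattern
        else:
            fout.write(line)

print(f"Deleted {delete_counter} line(s).")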