Corey Hermanson coreyhermanson

Python CodeBook

Unscheduled: Deep Web harvest will execute immediately (no delay parameter), and run once (scheduleType="ONCE" and no interval parameter)

 {
  "id": "string",
  "harvestEventType": "DEEP",

	#!/usr/bin/env python

	import pyperclip
	import sys

	"""
	Example output for below configuration, pasted to your Clipboard:
	http://www.exampledomain.com/all-stories/?page=1
	http://www.exampledomain.com/all-stories/?page=2
	http://www.exampledomain.com/all-stories/?page=3

	#!/usr/bin/env python

	import re


	input_file = 'infile.txt' # enter full file path, precede string with 'r' (r'PATH') if using Windows
	output_file = 'outfile.txt' # enter full file path, precede string with 'r' (r'PATH') if using Windows
	delete_counter = 0

	# list of individual regex, which will be combined into a single regex in the next step

	#!/usr/bin/env python

	import pyperclip

	example_list = ["Line 1", "Line 2", "Line 3", "forever and ever"]

	def list_to_clipboard(output_list):
	    """Copy the items of output_list to the clipboard, one item per line.

	    Nothing is copied when output_list is empty, so the current clipboard
	    contents are left as they are.

	    Args:
	        output_list: sequence of strings; items are joined with newlines.
	    """
	    # Guard against an empty list so an empty string never replaces
	    # whatever is currently on the clipboard.
	    if len(output_list) > 0:
	        pyperclip.copy('\n'.join(output_list))

	EXAMPLE JSON PAYLOADS FOR BRIGHTPLANET HARVEST API
	=================================================
	1. Website harvest - scraping search results pages
	2. Website harvest - harvesting a list of URLs, includes XPath overwrite and date-finding XPath
	3. Website harvest - scheduled harvest to monitor new documents
	4. Deep Web harvest - query search engines (USE SPARINGLY - rate limits)
	5. Deep Web harvest - query sources from multiple source groups
	6. RSS harvest - monitor new documents daily using RSS feeds, includes XPath overwrite and date-finding XPath
	7. XPath expressions - use these XPaths to control which text is harvested from a web page
	=================================================

	import requests

	infile = r'C:\Users\Account\PythonFiles\generic_infile.txt' # full path to any file inside quotes

	# Harvest Event Variables
	api_key = "123abc" # STRING - 1 API key per Harvest API schema
	searchable_items_per_event = 100 # INT - max queries OR max screenNames
	name_of_event = "NewYork_Politics" # STRING - Program will prepend "TW_" and append "_#" to this name
	filterQuery = None # STRING - ex: "nuclear AND (war OR energy)"
	event_tags = ["source_Politics", "New York"] # LIST

	import requests
	import csv

	input_file = r'YOUR_FULL_FILEPATH_HERE'
	var_scheduled = "RECURRING"
	var_initial_delay = 1.0 # float
	var_time_between_scheduled_events = 12.0 # float
	var_max_depth = 1
	var_depth_external = 0
	var_max_docsize = -1