- Unscheduled: the Deep Web harvest executes immediately (no delay parameter) and runs once (scheduleType="ONCE" and no interval parameter)
{
  "id": "string",
  "harvestEventType": "DEEP",
#!/usr/bin/env python
"""
This script returns documents from the BrightPlanet REST API. Input is a text file with a list of queries.
Output is a CSV file with your desired fields for each document. The default time period is everything up to the present.
Requires the 'requests' module. To install from the command line, enter: python -m pip install requests
"""
import requests
import csv

input_file = r'YOUR_FULL_FILEPATH_HERE'      # full path to the text file of queries

# Harvest Event Variables
var_scheduled = "RECURRING"                  # string - "ONCE" or "RECURRING"
var_initial_delay = 1.0                      # float
var_time_between_scheduled_events = 12.0     # float
var_max_depth = 1                            # int
var_depth_external = 0                       # int
var_max_docsize = -1                         # int
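The header above declares only the inputs. A sketch of the remaining flow described in the docstring (read queries, fetch documents, write a CSV) follows; the endpoint, the query parameter name, the response shape, and the CSV fields are illustrative assumptions rather than the documented API, and output_file is a placeholder introduced here.

# Sketch only: DOCS_URL, the "q" parameter, and the response keys are assumptions.
DOCS_URL = "https://example.brightplanet.com/documents"   # hypothetical endpoint
output_file = r'YOUR_OUTPUT_CSV_HERE'                     # placeholder output path

with open(input_file) as f:
    queries = [line.strip() for line in f if line.strip()]

with open(output_file, 'w', newline='', encoding='utf-8') as out:
    writer = csv.writer(out)
    writer.writerow(["query", "title", "url", "date"])         # your desired fields
    for query in queries:
        resp = requests.get(DOCS_URL, params={"q": query})     # assumed parameter name
        resp.raise_for_status()
        for doc in resp.json().get("documents", []):           # assumed response shape
            writer.writerow([query, doc.get("title"), doc.get("url"), doc.get("date")])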
#!/usr/bin/env python
import pyperclip
import re
from list_clipboard_manipulations import list_to_clipboard

delete_counter = 0    # number of lines dropped from the clipboard contents
good_list = list()    # lines that survive filtering
sort_alpha = False    # set True to sort the kept lines alphabetically before copying
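A sketch of how these variables might be used follows: read the clipboard, drop lines that match a pattern, optionally sort what is left, and copy it back with list_to_clipboard. The pattern is only a placeholder.

# Placeholder pattern; swap in whatever you actually want to strip.
drop_pattern = re.compile(r'^\s*$')          # example: drop blank lines

for line in pyperclip.paste().splitlines():
    if drop_pattern.search(line):
        delete_counter += 1                  # count the lines thrown away
    else:
        good_list.append(line)               # keep everything else

if sort_alpha:
    good_list.sort(key=str.lower)            # optional alphabetical sort

list_to_clipboard(good_list)                 # copy the surviving lines back
print(f"Removed {delete_counter} line(s); kept {len(good_list)}.")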
import requests

infile = r'C:\Users\Account\PythonFiles\generic_infile.txt'  # full path to any file inside quotes

# Harvest Event Variables
api_key = "123abc"                              # STRING - 1 API key per Harvest API schema
searchable_items_per_event = 100                # INT - max queries OR max screenNames
name_of_event = "NewYork_Politics"              # STRING - program will prepend "TW_" and append "_#"
filterQuery = None                              # STRING - ex: "nuclear AND (war OR energy)"
event_tags = ["source_Politics", "New York"]    # LIST
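Below is a sketch of how these variables could drive event creation: the items in infile are split into batches no larger than searchable_items_per_event, and each batch is named TW_<name_of_event>_<n>, matching the naming rule noted above. The endpoint and payload keys are assumptions, not the documented schema.

# Sketch: EVENTS_URL and the payload keys are assumptions.
EVENTS_URL = "https://example.brightplanet.com/harvest/event"   # hypothetical endpoint

with open(infile) as f:
    items = [line.strip() for line in f if line.strip()]

# Split the queries/screen names into batches of at most searchable_items_per_event.
batches = [items[i:i + searchable_items_per_event]
           for i in range(0, len(items), searchable_items_per_event)]

for n, batch in enumerate(batches, start=1):
    payload = {
        "name": f"TW_{name_of_event}_{n}",   # "TW_" prefix and "_#" suffix, as noted above
        "items": batch,
        "filterQuery": filterQuery,
        "tags": event_tags,
    }
    resp = requests.post(EVENTS_URL, json=payload, headers={"Authorization": api_key})
    resp.raise_for_status()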
EXAMPLE JSON PAYLOADS FOR BRIGHTPLANET HARVEST API
=================================================
1. Website harvest - scraping search results pages
2. Website harvest - harvesting a list of URLs, includes XPath overwrite and date-finding XPath
3. Website harvest - scheduled harvest to monitor new documents
4. Deep Web harvest - query search engines (USE SPARINGLY - rate limits)
5. Deep Web harvest - query sources from multiple source groups
6. RSS harvest - monitor new documents daily using RSS feeds, includes XPath overwrite and date-finding XPath
7. XPath expressions - use these XPaths to control which text is harvested from a web page
=================================================
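The payloads listed as items 1-7 are not reproduced here. As a generic illustration of item 7, two XPath expressions of the kind used for an XPath overwrite and for date finding are sketched below; the element and class names are hypothetical.

# Hypothetical page structure; adjust the element/class names to the target site.
xpath_overwrite = "//div[@class='article-body']//text()"   # keep only the article body text
xpath_date = "//time/@datetime"                             # pull the date from a <time> element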
#!/usr/bin/env python
import pyperclip

example_list = ["Line 1", "Line 2", "Line 3", "forever and ever"]

def list_to_clipboard(output_list):
    """ Check if len(list) > 0, then copy to clipboard """
    if len(output_list) > 0:
        pyperclip.copy('\n'.join(output_list))
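A quick usage example with the example_list defined above:

list_to_clipboard(example_list)   # copies the four lines to the clipboard, newline-separated
print(pyperclip.paste())          # paste back out to confirm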
#!/usr/bin/env python
import re

input_file = 'infile.txt'    # enter full file path; precede the string with 'r' (r'PATH') if using Windows
output_file = 'outfile.txt'  # enter full file path; precede the string with 'r' (r'PATH') if using Windows
delete_counter = 0

# list of individual regexes, which will be combined into a single regex in the next step
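A sketch of that next step follows; the two patterns are placeholders for whatever lines you actually want to delete.

regex_list = [
    r'^\s*$',    # placeholder: blank lines
    r'^#',       # placeholder: comment lines
]
combined = re.compile('|'.join(regex_list))   # one alternation covering every pattern

with open(input_file) as fin, open(output_file, 'w') as fout:
    for line in fin:
        if combined.search(line):
            delete_counter += 1               # drop lines matching any pattern
        else:
            fout.write(line)

print(f"Deleted {delete_counter} line(s).")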