jronallo · February 16, 2017 00:53
diff --git a/NC-HB2-ids.sh b/NC-HB2-ids.sh
 # This script requires the jq utility
 # https://stedolan.github.io/jq/
 # Datasets created with twarc
 # https://github.com/DocNow/twarc

 # Build NCHB2-ids/NCHB2-ids.txt (sorted, de-duplicated Tweet IDs) and split it
 # into 50,000-line chunks. Paths are relative, so run this from the directory
 # that holds the search folders.
 #
 # NOTE(review): jq releases before 1.7 convert numbers to IEEE-754 doubles, so
 # IDs above 2^53 may be rounded. If the input also carries the ID as a string
 # (presumably `id_str` for twarc data -- confirm), extracting that is safer.

 mkdir -p NCHB2-ids || exit 1
 # -f keeps a first run (nothing to delete yet) from printing an error.
 rm -f -- NCHB2-ids/NCHB2*
 touch NCHB2-ids/NCHB2-ids-with-dupes.txt

 # Create more relevant subset of "North Carolina" search
 jq -c '{id, text}' North_Carolina/*.json \
   | grep -Eih 'text.*(hb2|bill|bathroom|KeepNCFair)' \
   | jq '.id' >> NCHB2-ids/NCHB2-ids-with-dupes.txt

 # Get the IDs for all of the Tweets in the other searches.
 # find runs jq itself (-exec ... +) so paths containing whitespace stay intact,
 # very long file lists are batched, and jq is never started without a file
 # argument (which would leave it waiting on stdin).
 find -L . -name '*.json' -not -path './North_Carolina/*' -exec jq '.id' {} + \
   >> NCHB2-ids/NCHB2-ids-with-dupes.txt


 # Stop if the directory cannot be entered; otherwise sort/split would write
 # their output into the wrong place.
 pushd NCHB2-ids || exit 1
 # Sort IDs and remove duplicates
 sort -u NCHB2-ids-with-dupes.txt > NCHB2-ids.txt

 # Break into files with 50,000 Tweet IDs each
 split -l 50000 NCHB2-ids.txt NCHB2-ids.
 popd
	# This script requires the jq utility
	# https://stedolan.github.io/jq/
	# Datasets created with twarc
	# https://github.com/DocNow/twarc

	# Build NCHB2-ids/NCHB2-ids.txt (sorted, de-duplicated Tweet IDs) and split it
	# into 50,000-line chunks. Paths are relative, so run this from the directory
	# that holds the search folders.
	#
	# NOTE(review): jq releases before 1.7 convert numbers to IEEE-754 doubles, so
	# IDs above 2^53 may be rounded. If the input also carries the ID as a string
	# (presumably `id_str` for twarc data -- confirm), extracting that is safer.

	mkdir -p NCHB2-ids || exit 1
	# -f keeps a first run (nothing to delete yet) from printing an error.
	rm -f -- NCHB2-ids/NCHB2*
	touch NCHB2-ids/NCHB2-ids-with-dupes.txt

	# Create more relevant subset of "North Carolina" search.
	# The pipes must be real shell pipes (not "\|") and the glob/regex need their
	# "*" wildcards, otherwise jq receives "|" and "grep" as file names.
	jq -c '{id, text}' North_Carolina/*.json \
	  | grep -Eih 'text.*(hb2|bill|bathroom|KeepNCFair)' \
	  | jq '.id' >> NCHB2-ids/NCHB2-ids-with-dupes.txt

	# Get the IDs for all of the Tweets in the other searches.
	# find runs jq itself (-exec ... +) so paths containing whitespace stay intact,
	# very long file lists are batched, and jq is never started without a file
	# argument (which would leave it waiting on stdin).
	find -L . -name '*.json' -not -path './North_Carolina/*' -exec jq '.id' {} + \
	  >> NCHB2-ids/NCHB2-ids-with-dupes.txt


	# Stop if the directory cannot be entered; otherwise sort/split would write
	# their output into the wrong place.
	pushd NCHB2-ids || exit 1
	# Sort IDs and remove duplicates
	sort -u NCHB2-ids-with-dupes.txt > NCHB2-ids.txt

	# Break into files with 50,000 Tweet IDs each
	split -l 50000 NCHB2-ids.txt NCHB2-ids.
	popd