Skip to content

Instantly share code, notes, and snippets.

@zackster
Created August 29, 2016 16:48
Show Gist options
  • Save zackster/d12fb0fee0a54e52ab89bd42ad4449a5 to your computer and use it in GitHub Desktop.
Save zackster/d12fb0fee0a54e52ab89bd42ad4449a5 to your computer and use it in GitHub Desktop.
# Script for processing SEC 8-K filings en masse.
require 'nokogiri'
require 'pg'
require 'json'
require 'parallel'
require 'timeout'
# Counts how many times each element of +array+ occurs.
#
# @param array [Enumerable] elements to count; each element is used as a hash key
# @return [Hash] element => number of occurrences, keyed in first-seen order.
#   The hash is built with a default of 0, so looking up an unseen key yields 0.
def score(array)
  counts = Hash.new(0)
  array.each { |element| counts[element] += 1 }
  counts
end
# Load the work list: a random sample (random() < 0.1, i.e. roughly 10%) of the
# rows in form_8k_ngrams whose n2_grams column has not been filled in yet.
#
# PG.connect is used instead of the PGconn alias, which was removed in pg 1.0.
# Connection settings come from the postgres_db / postgres_user /
# postgres_user_pw environment variables.
pg_client = PG.connect(dbname: ENV['postgres_db'], user: ENV['postgres_user'], password: ENV['postgres_user_pw'])
results = begin
  pg_client.exec("select filename from form_8k_ngrams where n2_grams is null and random()<0.1").to_a
ensure
  # Close the connection even when the query raises, so it is never leaked.
  pg_client.close
end
require 'set'

# Directory that holds the downloaded 8-K filings.
FORMS_8K_DIR = '/path/to/forms/8K'

# Maximum number of seconds spent on a single filing before it is skipped.
FILING_TIMEOUT_SECONDS = 30

# N-gram sizes that are computed and stored for every filing.
NGRAM_SIZES = [2, 3, 4].freeze

# Words removed from the text before n-grams are built. Kept in a Set so the
# per-word membership test does not scan a list, and built once instead of
# once per filing.
STOPWORDS = Set.new(%w[
  i me my myself we our ours ourselves you your yours yourself yourselves
  he him his himself she her hers herself it its itself they them their
  theirs themselves what which who whom this that these those am is are was
  were be been being have has had having do does did doing a an the and but
  if or because as until while of at by for with about against between into
  through during before after above below to from up down in out on off over
  under again further then once here there when where why how all any both
  each few more most other some such no nor not only own same so than too
  very s t can will just don should now d ll m o re ve y ain aren couldn
  didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn
  weren won wouldn
]).freeze

# Finds the filing on disk.
#
# @param filename [String] base name of the filing (no directory part)
# @return [String, nil] the exact path when that file exists and is non-empty,
#   otherwise the first path matching "<name without extension>*", or nil
def locate_filing(filename)
  exact_path = File.join(FORMS_8K_DIR, filename)
  # File.size? is nil for both a missing and a zero-byte file.
  return exact_path if File.size?(exact_path)

  # File.basename(.., '.*') strips only the last extension. A name without a
  # dot therefore stays intact instead of collapsing to "" (which would make
  # the glob match every file in the directory).
  stem = File.basename(filename, '.*')
  Dir.glob(File.join(FORMS_8K_DIR, "#{stem}*")).first
end

# Extracts the value following "ACCEPTANCE-DATETIME>" from the raw filing.
#
# @param doc [String] raw filing contents
# @return [String, nil] the captured value, or nil when the tag is absent or
#   the document cannot be matched because of invalid/incompatible encoding
def acceptance_datetime_of(doc)
  doc[/ACCEPTANCE-DATETIME>(.+)/, 1]
rescue ArgumentError, EncodingError
  # Matching raises on invalid byte sequences; such filings are skipped.
  # Only encoding problems are rescued so a Timeout::Error is not swallowed.
  nil
end

# Returns the plain text of the filing's <TEXT> sections.
#
# @param doc [String] raw filing contents
# @return [String] text of all sections after the header, with HTML stripped
def filing_body_text(doc)
  # Drop the header (everything before the first <TEXT>) first, then discard
  # sections containing 'begin 644' (the original treated these as images).
  # Dropping first keeps the result an Array even when every section is rejected.
  sections = doc.split('<TEXT>').drop(1).reject { |section| section.include?('begin 644') }
  Nokogiri::HTML(sections.join(' ')).text
end

# Normalizes text for tokenizing: non-breaking spaces become spaces, runs of
# whitespace collapse to one space, curly quotes become "'", everything is
# lower-cased, and ':', ';' and '.' are removed.
#
# @param text [String]
# @return [String]
def normalize_text(text)
  text.tr("\u00A0", ' ')
      .gsub(/\s+/, ' ')
      .gsub(/“|”|’/, "'")
      .downcase
      .delete(':;.')
end

# Builds the JSON document of n-gram counts, ordered by ascending count.
#
# @param words [Array<String>] tokens with stopwords already removed
# @param n [Integer] n-gram size
# @return [String] JSON object produced from the sorted n-gram => count hash
def ngram_counts_json(words, n)
  score(words.each_cons(n).to_a).sort_by { |_gram, count| count }.to_h.to_json
end

# Writes the computed n-grams and acceptance datetime for one filing.
#
# @param original_filename [String] value of the row's filename column
# @param ngram_json [Hash{Integer=>String}] JSON per n-gram size (2, 3, 4)
# @param acceptance_datetime [String]
# @return [void]
def store_ngrams(original_filename, ngram_json, acceptance_datetime)
  # A connection is opened per filing, as in the original; it is opened inside
  # the Parallel worker rather than shared with the parent.
  pg_client = PG.connect(dbname: ENV['postgres_db'], user: ENV['postgres_user'], password: ENV['postgres_user_pw'])
  pg_client.exec_params(
    'update form_8k_ngrams set n2_grams=$1,n3_grams=$2,n4_grams=$3,acceptance_datetime=$4 where filename=$5',
    [ngram_json[2], ngram_json[3], ngram_json[4], acceptance_datetime, original_filename]
  )
ensure
  # Close even when the update or the surrounding timeout raises.
  pg_client.close if pg_client
end

# Process every sampled filing in parallel: locate the file, extract its text,
# compute 2/3/4-gram counts and store them back into form_8k_ngrams.
Parallel.each_with_index(results) do |result_row, idx|
  original_filename = result_row['filename']
  filename = File.basename(original_filename.to_s)
  next if filename.empty?

  path_to_actual_file = locate_filing(filename)
  if path_to_actual_file.nil?
    puts 'could not find file'
    next
  end
  puts 'file exists'

  begin
    Timeout.timeout(FILING_TIMEOUT_SECONDS) do
      # File.read closes the handle; Kernel#open would also treat a path
      # starting with "|" as a command to run.
      doc = File.read(path_to_actual_file)
      # cik = doc.match(/CENTRAL INDEX KEY.+/)[0].split(/\s+/).last
      acceptance_datetime = acceptance_datetime_of(doc)
      # `next` leaves only the timeout block, which is the last statement of
      # this iteration, so the filing is simply skipped.
      next if acceptance_datetime.nil?

      words = normalize_text(filing_body_text(doc)).split(' ').reject { |word| STOPWORDS.include?(word) }
      ngram_json = NGRAM_SIZES.each_with_object({}) do |n, json_by_size|
        json_by_size[n] = ngram_counts_json(words, n)
      end

      store_ngrams(original_filename, ngram_json, acceptance_datetime)
      puts "success! #{idx}"
    end
  rescue Timeout::Error
    puts "skipped #{filename} because too slow"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment